undefinedStrings, is now returned that holds field values that are judged to be undefined strings. i.e. they are a non-numeric value that is not defined in a @string{...} entry and not enclosed by braces or double-quotes. This array will be empty unless the following condition is met: ($this->removeDelimit || $this->expandMacro && $this->fieldExtract) 24/04/2006 Esteban Zimanyi - When an undefined string is found in function removeDelimiters return the empty string - Return $this->undefinedStrings in the last position to allow compatibility with previous versions - Fix management of preamble in function returnArrays */ // For a quick command-line test (php -f PARSEENTRIES.php) after installation, uncomment these lines: require_once(drupal_get_path('module', 'biblio') . '/biblio.contributors.inc'); /************************* // Parse a file $parse = NEW PARSEENTRIES(); $parse->expandMacro = TRUE; // $array = array("RMP" =>"Rev., Mod. Phys."); // $parse->loadStringMacro($array); // $parse->removeDelimit = FALSE; // $parse->fieldExtract = FALSE; $parse->openBib("bib.bib"); $parse->extractEntries(); $parse->closeBib(); list($preamble, $strings, $entries, $undefinedStrings) = $parse->returnArrays(); print_r($preamble); print "\n"; print_r($strings); print "\n"; print_r($entries); print "\n\n"; *************************/ /************************ // Parse a bibtex PHP string $bibtex_data = <<< END @STRING{three = "THREE"} @STRING{two = "TWO"} @string{JRNL23 = {NatLA 23 } # " " # two # " " # three} @article{klitzing.1, author = "v. 
/*
Klitzing and Dorda and Pepper",
    title = "New method for high mark@sirfragalot.com accuracy determination of fine structure constant based on quantized hall resistance",
    volume = "45",
    journal = {Journal of } # JRNL23,
    pages = "494",
    citeulike-article-id = {12222 } ,
    ignoreMe = {blah},
}
@article { klitzing.2,
    author = "Klaus von Klitzing",
    title = "The Quantized Hall Effect",
    volume = "58",
    journal = two,
    pages = "519",
}
END;
$parse = NEW PARSEENTRIES();
$parse->expandMacro = TRUE;
// $parse->removeDelimit = FALSE;
// $parse->fieldExtract = FALSE;
$array = array("RMP" =>"Rev., Mod. Phys.");
$parse->loadStringMacro($array);
$parse->loadBibtexString($bibtex_data);
$parse->extractEntries();
list($preamble, $strings, $entries, $undefinedStrings) = $parse->returnArrays();
print_r($preamble);
print "\n";
print_r($strings);
print "\n";
print_r($entries);
print "\n\n";
**********************/

/**
 * BibTeX parser.
 *
 * Typical use: openBib() or loadBibtexString(), then extractEntries(), then
 * returnArrays() (or bib2node(), which calls returnArrays() itself).
 *
 * Behaviour is controlled by three public flags:
 *  - fieldExtract  (default TRUE):  split every entry into a key => value array;
 *                                   when FALSE entries are returned as raw strings.
 *  - removeDelimit (default TRUE):  strip the enclosing "..." or {...} from field values.
 *  - expandMacro   (default FALSE): replace @string macros by their value.
 * removeDelimit and expandMacro have no effect unless fieldExtract is TRUE.
 */
class PARSEENTRIES
{
    // Raw @preamble entries; returnArrays() replaces them by array('bibtexPreamble' => text).
    public $preamble = array();
    // Raw @string entries while parsing; macro name => value once returnArrays() has run.
    public $strings = array();
    // Non-numeric, unquoted field values that match no known @string macro.
    public $undefinedStrings = array();
    // Parsed entries, indexed from 0.
    public $entries = array();
    // Number of entries stored so far (index of the next entry).
    public $count = 0;
    public $fieldExtract = TRUE;
    public $removeDelimit = TRUE;
    public $expandMacro = FALSE;
    // TRUE: read from the file handle $fid; FALSE: read from $bibtexString.
    public $parseFile = TRUE;
    public $outsideEntry = TRUE;
    // File handle opened by openBib().
    public $fid = NULL;
    // Lines of the in-memory BibTeX source and the index of the next line to read.
    public $bibtexString = array();
    public $currentLine = 0;
    // Macros supplied by the caller through loadStringMacro().
    public $userStrings = array();
    // Search pattern => replacement table applied to strings given to loadBibtexString().
    public $transtab_latex_unicode = array();
    // Number of entries whose field values returnArrays() has already post-processed.
    private $processedEntries = 0;

    /**
     * Reset the parser to its default state and load the LaTeX => Unicode table.
     */
    function __construct()
    {
        $this->preamble = $this->strings = $this->undefinedStrings = $this->entries = array();
        $this->count = 0;
        $this->processedEntries = 0;
        $this->fieldExtract = TRUE;
        $this->removeDelimit = TRUE;
        $this->expandMacro = FALSE;
        $this->parseFile = TRUE;
        $this->outsideEntry = TRUE;
        $this->transtab_latex_unicode = $this->loadTranstab();
    }

    /**
     * PHP 4 style constructor name, kept so that existing callers such as
     * parent::PARSEENTRIES() keep working. PHP 8 no longer treats a method
     * named after the class as a constructor, hence __construct() above.
     */
    function PARSEENTRIES()
    {
        $this->__construct();
    }

    /**
     * Load the LaTeX => Unicode replacement table once per request.
     *
     * NOTE(review): the include file is not visible from here; it is presumed to
     * define $transtab_latex_unicode (returnArrays() used to declare that name as
     * a global "defined in 'transtab_latex_unicode.inc.php'"). If the variable is
     * not found the table stays empty and no replacement is performed.
     *
     * @return array search pattern => replacement
     */
    private function loadTranstab()
    {
        static $table = NULL;
        if ($table === NULL) {
            $table = array();
            // Outside Drupal (e.g. a command-line test) there is nothing to include.
            if (function_exists('drupal_get_path')) {
                require_once(drupal_get_path('module', 'biblio') . '/bibtexParse/transtab_latex_unicode.inc.php');
                if (isset($transtab_latex_unicode) && is_array($transtab_latex_unicode)) {
                    // The file was included just now, into this method's scope.
                    $table = $transtab_latex_unicode;
                }
                elseif (isset($GLOBALS['transtab_latex_unicode']) && is_array($GLOBALS['transtab_latex_unicode'])) {
                    // The file had already been included at global scope.
                    $table = $GLOBALS['transtab_latex_unicode'];
                }
            }
        }
        return $table;
    }

    /**
     * Open a bib file for parsing.
     *
     * NOTE: terminates the script when $file is not a regular file (historical behaviour).
     *
     * @param string $file path of the BibTeX file
     */
    function openBib($file)
    {
        if (!is_file($file)) {
            die;
        }
        $this->fid = fopen($file, 'r');
        $this->parseFile = TRUE;
    }

    /**
     * Load BibTeX source held in memory.
     *
     * @param string|array $bibtex_string the whole source as one string (the
     *        LaTeX => Unicode table is applied to it) or an array of lines (used as is)
     */
    function loadBibtexString($bibtex_string)
    {
        if (is_string($bibtex_string)) {
            $bibtex_string = $this->searchReplaceText($this->transtab_latex_unicode, $bibtex_string, FALSE);
            $this->bibtexString = explode("\n", $bibtex_string);
        }
        elseif (is_array($bibtex_string)) {
            // getLine() walks the lines by position, so drop any custom keys.
            $this->bibtexString = array_values($bibtex_string);
        }
        else {
            $this->bibtexString = array();
        }
        $this->parseFile = FALSE;
        $this->currentLine = 0;
    }

    /**
     * Apply a list of regular-expression replacements to a string.
     *
     * @param array  $searchReplaceActionsArray search pattern => replacement
     * @param string $sourceString text to transform
     * @param bool   $includesSearchPatternDelimiters FALSE when the patterns
     *               still need to be wrapped in "/" delimiters
     * @return string the transformed text
     */
    function searchReplaceText($searchReplaceActionsArray, $sourceString, $includesSearchPatternDelimiters = FALSE)
    {
        if (!is_array($searchReplaceActionsArray)) {
            return $sourceString;
        }
        foreach ($searchReplaceActionsArray as $searchString => $replaceString) {
            if (!$includesSearchPatternDelimiters) {
                $searchString = "/" . $searchString . "/";
            }
            $replaced = preg_replace($searchString, $replaceString, $sourceString);
            // preg_replace() yields NULL on error; keep the text untouched in that case.
            if ($replaced !== NULL) {
                $sourceString = $replaced;
            }
        }
        return $sourceString;
    }

    /**
     * Register caller-supplied macros (macro name => value).
     *
     * @param array $macro_array
     */
    function loadStringMacro($macro_array)
    {
        $this->userStrings = $macro_array;
    }

    /**
     * Close the file opened by openBib().
     */
    function closeBib()
    {
        if (is_resource($this->fid)) {
            fclose($this->fid);
        }
    }

    /**
     * Get the next non-empty line, trimmed, from the file or the in-memory source.
     *
     * @return string|false the line, or FALSE when the input is exhausted
     */
    function getLine()
    {
        if ($this->parseFile) {
            if (!is_resource($this->fid)) {
                return FALSE;
            }
            while (!feof($this->fid)) {
                $line = trim((string) fgets($this->fid));
                if ($line !== '') {
                    return $line;
                }
            }
            return FALSE;
        }
        $total = count($this->bibtexString);
        while ($this->currentLine < $total) {
            $line = trim((string) $this->bibtexString[$this->currentLine]);
            $this->currentLine++;
            if ($line !== '') {
                return $line;
            }
        }
        return FALSE;
    }

    /**
     * Extract the value part of a @string definition.
     *
     * @param string $string text following the "=" of the definition, still
     *        carrying the closing delimiter of the @string entry
     * @return string the value without delimiters, expanded with previously defined macros
     */
    function extractStringValue($string)
    {
        // drop the closing delimiter of the @string entry
        $string = trim((string) substr((string) $string, 0, -1));
        return $this->removeDelimitersAndExpand($string);
    }

    /**
     * Regular expression matching the ", key =" separator between two fields.
     * It also matches keys such as "another-field".
     *
     * @return string
     */
    private function fieldDelimiterPattern()
    {
        return "/,\s*([-_.:,a-zA-Z0-9]+)\s*={1}\s*/U";
    }

    /**
     * Split a run of fields at the first ", key =" separator.
     *
     * @param string $seg text starting with a field value
     * @return array array(first value, text following the separator), or
     *         array(first value, FALSE) when no separator is left
     */
    function fieldSplit($seg)
    {
        // Limit 2: only the first separator is consumed; the key it names is not returned.
        $array = preg_split($this->fieldDelimiterPattern(), $seg, 2);
        if (!is_array($array)) {
            return array($seg, FALSE);
        }
        if (!array_key_exists(1, $array)) {
            return array($array[0], FALSE);
        }
        return array($array[0], $array[1]);
    }

    /**
     * Split the field list of an entry into keys and raw values and store them
     * in $this->entries[$this->count]. Keys are lower-cased; values keep their
     * delimiters so that returnArrays() can still tell macros from literals.
     *
     * @param string $oldString everything that follows the citation key
     */
    function reduceFields($oldString)
    {
        $oldString = (string) $oldString;
        // Remove just one closing character, so that an entry ending with
        // "somefield = {aValue}}" is parsed correctly.
        $lg = strlen($oldString);
        if ($lg > 0 && ($oldString[$lg - 1] == "}" || $oldString[$lg - 1] == ")" || $oldString[$lg - 1] == ",")) {
            $oldString = substr($oldString, 0, $lg - 1);
        }
        $split = preg_split("/=/", $oldString, 2);
        if (!is_array($split) || count($split) < 2) {
            return; // no "key = value" pair at all
        }
        // With the separators captured the pieces alternate: value, key, value, key, ...
        $parts = preg_split($this->fieldDelimiterPattern(), $split[1], -1, PREG_SPLIT_DELIM_CAPTURE);
        if (!is_array($parts)) {
            return;
        }
        $keys = array($split[0]);
        $values = array();
        foreach ($parts as $index => $part) {
            if ($index % 2 == 0) {
                $values[] = $part;
            }
            else {
                $keys[] = $part;
            }
        }
        foreach ($keys as $index => $key) {
            // remove any dangling ',' left on the final field of the entry
            $value = trim(rtrim(trim($values[$index]), ","));
            if ($value === '') {
                continue;
            }
            $this->entries[$this->count][strtolower(trim($key))] = $value;
        }
    }

    /**
     * Start splitting a bibtex entry into component fields: store the entry
     * type and the citation key, then hand the field list to reduceFields().
     *
     * @param string $entry a complete entry, e.g. '@article{key, author = "..."}'
     */
    function fullSplit($entry)
    {
        $matches = preg_split("/@(.*)[{(](.*),/U", $entry, 2, PREG_SPLIT_DELIM_CAPTURE);
        // sometimes a bibtex entry will have no citation key: what was captured is a field
        if (!is_array($matches) || count($matches) < 4 || preg_match("/=/", $matches[2])) {
            $matches = preg_split("/@(.*)\s*[{(](.*)/U", $entry, 2, PREG_SPLIT_DELIM_CAPTURE);
            if (!is_array($matches) || count($matches) < 4) {
                $matches = array('', '', '', '');
            }
            elseif (strpos($matches[3], '=') === FALSE) {
                // "@type{key}": a citation key without any field
                $matches[2] = rtrim($matches[3], " }),");
                $matches[3] = '';
            }
        }
        $this->entries[$this->count]['bibtexEntryType'] = strtolower(trim($matches[1]));
        $this->entries[$this->count]['bibtexCitation'] = trim($matches[2]);
        $this->reduceFields($matches[3]);
    }

    /**
     * File a complete bibtex entry: @string and @preamble entries are kept raw
     * for returnArrays(), @comment is ignored, anything else becomes an entry.
     *
     * @param string $entry text from the '@' to the closing delimiter
     * @return bool|null FALSE, or NULL when $entry does not look like an entry
     */
    function parseEntry($entry)
    {
        if (!preg_match("/@(.*)([{(])/U", $entry, $matches)) {
            return;
        }
        $type = trim($matches[1]);
        if (preg_match("/string/i", $type)) {
            $this->strings[] = $entry;
        }
        elseif (preg_match("/preamble/i", $type)) {
            $this->preamble[] = $entry;
        }
        elseif (preg_match("/comment/i", $type)) {
            // @comment is ignored
        }
        else {
            if ($this->fieldExtract) {
                $this->fullSplit($entry);
            }
            else {
                $this->entries[$this->count] = $entry;
            }
            $this->count++;
        }
        return FALSE;
    }

    /**
     * Look up a macro name among the known @string macros.
     * Macros are case insensitive, so the lower-cased name is tried first; the
     * exact name is tried next because caller-supplied macros keep their own case.
     *
     * @param string $name
     * @return string|false the key under which the macro is stored, or FALSE
     */
    private function findMacro($name)
    {
        $lower = strtolower($name);
        if (array_key_exists($lower, $this->strings)) {
            return $lower;
        }
        if (array_key_exists($name, $this->strings)) {
            return $name;
        }
        return FALSE;
    }

    /**
     * Remove the enclosing "..." or {...} from a field value.
     *
     * A value that is neither delimited, numeric (a year etc.) nor the name of
     * a known macro is recorded in $this->undefinedStrings and replaced by ''.
     *
     * @param string $string raw field value
     * @return string
     */
    function removeDelimiters($string)
    {
        $string = (string) $string;
        if ($string === '') {
            return '';
        }
        if ($string[0] == "\"") {
            return (string) substr($string, 1, -1);
        }
        if ($string[0] == "{") {
            if ($string[strlen($string) - 1] == "}") {
                return (string) substr($string, 1, -1);
            }
            return $string;
        }
        if (is_numeric($string) || $this->findMacro($string) !== FALSE) {
            return $string;
        }
        // Undefined string: record it once, but return '' on every occurrence.
        if (!in_array($string, $this->undefinedStrings, TRUE)) {
            $this->undefinedStrings[] = $string;
        }
        return '';
    }

    /**
     * Works like explode('#', $val) but only splits on a '#' that concatenates
     * strings, not on one enclosed in "..." or {...},
     * as in @string{ "x # x" # ss # {xx{x}x} }.
     *
     * @param string $val
     * @return array the concatenated parts, untrimmed
     */
    function explodeString($val)
    {
        $strings = array();
        $openquote = FALSE;
        $bracelevel = 0;
        $start = 0;
        $length = strlen($val);
        for ($i = 0; $i < $length; $i++) {
            $char = $val[$i];
            if ($char == '"') {
                $openquote = !$openquote;
            }
            elseif ($char == '{') {
                $bracelevel++;
            }
            elseif ($char == '}') {
                $bracelevel--;
            }
            elseif ($char == '#' && !$openquote && !$bracelevel) {
                $strings[] = substr($val, $start, $i - $start);
                $start = $i + 1;
            }
        }
        $strings[] = (string) substr($val, $start);
        return $strings;
    }

    /**
     * Find the position of the delimiter closing an entry, following these
     * BibTeX rules:
     *  - inside the braces there can be arbitrarily nested pairs of braces,
     *    but braces must also be balanced inside quotes;
     *  - inside quotes, a " character cannot simply be escaped with \":
     *    it must be placed inside braces.
     *
     * @param string $val text of the entry, starting at or before its '@'
     * @param string $delimitEnd '}' or ')'
     * @return int position of the closing delimiter, or 0 when it is not there yet
     */
    function closingDelimiter($val, $delimitEnd)
    {
        $openquote = FALSE;
        $bracelevel = 0;
        $length = strlen($val);
        for ($i = 0; $i < $length; $i++) {
            $char = $val[$i];
            // a '"' found at brace level 0 defines a value such as "ss{\"o}ss"
            if ($char == '"' && !$bracelevel) {
                $openquote = !$openquote;
            }
            elseif ($char == '{') {
                $bracelevel++;
            }
            elseif ($char == '}') {
                $bracelevel--;
            }
            if ($char == $delimitEnd && !$openquote && !$bracelevel) {
                return $i;
            }
        }
        return 0;
    }

    /**
     * Remove enclosures around a field value and, if the expandMacro flag is
     * set, expand macros and resolve '#' concatenations.
     *
     * @param string $string raw field value
     * @param bool $inpreamble TRUE for @preamble text, which is never expanded
     * @return string
     */
    function removeDelimitersAndExpand($string, $inpreamble = FALSE)
    {
        // only expand the macro if flag set, if strings defined and not in preamble
        if (!$this->expandMacro || empty($this->strings) || $inpreamble) {
            return $this->removeDelimiters($string);
        }
        $expanded = "";
        foreach ($this->explodeString((string) $string) as $str) {
            // trim the part since # is usually enclosed by spaces
            $str = trim($str);
            $macro = $this->findMacro($str);
            if ($macro !== FALSE) {
                $expanded .= $this->strings[$macro];
            }
            else {
                $expanded .= $this->removeDelimiters($str);
            }
        }
        return $expanded;
    }

    /**
     * Tell which delimiter closes the entry starting in $text.
     *
     * @param string $text text starting with '@'
     * @return string|false '}' or ')', or FALSE when the opening delimiter has not been read yet
     */
    private function closingFor($text)
    {
        if (!preg_match("/@.*([{(])/U", $text, $matches)) {
            return FALSE;
        }
        return ($matches[1] == '{') ? '}' : ')';
    }

    /**
     * Extract entries taking into account how comments are defined in BibTeX.
     * BibTeX splits the file in two areas, inside an entry and outside an entry,
     * the delimitation being indicated by the presence of a @ sign. When this
     * character is met BibTeX expects to find an entry. Before that sign, and
     * after an entry, everything is considered a comment.
     */
    function extractEntries()
    {
        $inside = FALSE;
        $possibleEntryStart = FALSE;
        $delimitEnd = '}';
        $entry = "";
        while (($line = $this->getLine()) !== FALSE) {
            if ($possibleEntryStart !== FALSE) {
                $line = $possibleEntryStart . $line;
            }
            if (!$inside && strpos($line, '@') !== FALSE) {
                // throw away all characters before the '@'
                $line = strstr($line, '@');
                $closer = $this->closingFor($line);
                if ($closer === FALSE) {
                    // "@type" read, but its opening delimiter is on a later line
                    $possibleEntryStart = $line;
                }
                else {
                    $inside = TRUE;
                    $delimitEnd = $closer;
                    $possibleEntryStart = FALSE;
                }
            }
            if (!$inside) {
                continue;
            }
            $entry .= " " . $line;
            // A single line may close one entry and hold further complete entries.
            while ($inside && ($j = $this->closingDelimiter($entry, $delimitEnd))) {
                // characters after the delimiter are kept: they may start the next entry
                $lastLine = (string) substr($entry, $j + 1);
                // strip excess whitespace from the entry
                $this->parseEntry((string) preg_replace('/\s\s+/', ' ', substr($entry, 0, $j + 1)));
                $entry = "";
                $inside = FALSE;
                $next = strstr($lastLine, '@');
                if ($next === FALSE) {
                    break;
                }
                $closer = $this->closingFor($next);
                if ($closer === FALSE) {
                    $possibleEntryStart = $next;
                }
                else {
                    $entry = $next;
                    $delimitEnd = $closer;
                    $inside = TRUE;
                }
            }
        }
    }

    /**
     * Return the parsed data to the calling process.
     *
     * Macros are resolved and field values post-processed here. The method may
     * be called repeatedly (bib2node() calls it too): macros resolved and
     * entries processed by an earlier call are left as they are.
     *
     * @return array array($preamble, $strings, $entries, $undefinedStrings);
     *         $undefinedStrings comes last for compatibility with previous versions
     */
    function returnArrays()
    {
        foreach ($this->preamble as $value) {
            if (!is_string($value) || !preg_match("/.*?[{(](.*)/", $value, $matches)) {
                continue;
            }
            $preamble = (string) substr($matches[1], 0, -1);
            $preambles['bibtexPreamble'] = trim($this->removeDelimitersAndExpand(trim($preamble), TRUE));
        }
        if (isset($preambles)) {
            $this->preamble = $preambles;
        }
        if ($this->fieldExtract) {
            // Strings may be defined by previously-defined strings, so resolve them in order.
            $pending = $this->strings;
            // Start from the strings provided by the user, if any. It is supposed that
            // there are no substitutions to be made in the user strings, i.e., no #.
            $this->strings = is_array($this->userStrings) ? $this->userStrings : array();
            foreach ($pending as $name => $value) {
                $isRaw = is_int($name) && is_string($value) && preg_match('/^\s*@/', $value);
                if (!$isRaw) {
                    // a macro already resolved by an earlier call
                    $this->strings[$name] = $value;
                    continue;
                }
                $matches = preg_split("/@\s*string\s*([{(])/i", trim($value), 2, PREG_SPLIT_DELIM_CAPTURE);
                if (!is_array($matches) || count($matches) < 3) {
                    continue;
                }
                $pair = preg_split("/=/", $matches[2], 2);
                if (!is_array($pair) || count($pair) < 2) {
                    continue;
                }
                // macros are case insensitive
                $this->strings[strtolower(trim($pair[0]))] = $this->extractStringValue($pair[1]);
            }
        }
        // removeDelimit and expandMacro have NO effect if !$this->fieldExtract
        if (($this->removeDelimit || $this->expandMacro) && $this->fieldExtract) {
            $total = count($this->entries);
            for ($i = $this->processedEntries; $i < $total; $i++) {
                if (!isset($this->entries[$i]) || !is_array($this->entries[$i])) {
                    continue; // raw entry stored while fieldExtract was off
                }
                foreach ($this->entries[$i] as $key => $value) {
                    // don't expand macros for bibtexCitation and bibtexEntryType
                    if ($key != 'bibtexCitation' && $key != 'bibtexEntryType') {
                        $this->entries[$i][$key] = trim($this->removeDelimitersAndExpand($value));
                    }
                }
            }
            $this->processedEntries = $total;
        }
        // Empty arrays are returned as arrays (not FALSE) so that the same instance can
        // be used for several files, e.g. an entry file and its abbreviation file.
        return array($this->preamble, $this->strings, $this->entries, $this->undefinedStrings);
    }

    /**
     * Append one biblio node per parsed entry to $node_array.
     *
     * @param array $node_array list of nodes, extended in place
     * @param array $node template copied for every entry before the BibTeX
     *        fields are mapped onto biblio_* keys
     */
    function bib2node(&$node_array, $node)
    {
        // BibTeX entry type => biblio publication type id
        $typeMap = array(
            'article' => 102,
            'book' => 100,
            'booklet' => 101,
            'inbook' => 101,
            'conference' => 103,
            'incollection' => 100,
            'inproceedings' => 103,
            'manual' => 129,
            'mastersthesis' => 108,
            'misc' => 129,
            'phdthesis' => 108,
            'proceedings' => 104,
            'techreport' => 109,
            'unpublished' => 124,
        );
        // array(BibTeX field, node key); when several fields feed one key the last one present wins
        $fieldMap = array(
            array('journal', 'biblio_secondary_title'),
            array('booktitle', 'biblio_secondary_title'),
            array('series', 'biblio_secondary_title'),
            array('volume', 'biblio_volume'),
            array('number', 'biblio_number'),
            array('year', 'biblio_year'),
            array('note', 'biblio_notes'),
            array('month', 'biblio_date'),
            array('pages', 'biblio_pages'),
            array('publisher', 'biblio_publisher'),
            array('organization', 'biblio_publisher'),
            array('school', 'biblio_publisher'),
            array('institution', 'biblio_publisher'),
            array('title', 'title'),
            array('type', 'biblio_type_of_work'),
            array('edition', 'biblio_edition'),
            array('chapter', 'biblio_section'),
            array('address', 'biblio_place_published'),
            array('abstract', 'biblio_abst_e'),
            array('keywords', 'biblio_keywords'),
            array('isbn', 'biblio_isbn'),
            array('url', 'biblio_url'),
        );
        list($preamble, $strings, $entries, $undefinedStrings) = $this->returnArrays();
        foreach ($entries as $entry) {
            if (!is_array($entry)) {
                continue; // raw entry (fieldExtract was off): nothing to map
            }
            $node_array[] = $node;
            end($node_array);
            $node_id = key($node_array);
            $node_array[$node_id]['biblio_contributors'] = array();
            $type = isset($entry['bibtexEntryType']) ? $entry['bibtexEntryType'] : '';
            if (isset($typeMap[$type])) {
                $node_array[$node_id]['biblio_type'] = $typeMap[$type];
            }
            if (!empty($entry['author'])) {
                // split on ' and ' (or ' & ')
                foreach (preg_split("/\s(and|&)\s/i", trim($entry['author'])) as $author) {
                    $node_array[$node_id]['biblio_contributors'][] = array('name' => $author, 'ctid' => 1);
                }
            }
            if (!empty($entry['bibtexCitation'])) {
                $node_array[$node_id]['biblio_citekey'] = $entry['bibtexCitation'];
            }
            if (!empty($entry['editor'])) {
                foreach (preg_split("/\s(and|&)\s/i", trim($entry['editor'])) as $editor) {
                    $node_array[$node_id]['biblio_contributors'][] = array('name' => $editor, 'ctid' => 2);
                }
            }
            foreach ($fieldMap as $mapping) {
                list($field, $target) = $mapping;
                // an explicit test rather than empty(), so that a value of "0" is kept
                if (isset($entry[$field]) && $entry[$field] !== '') {
                    $node_array[$node_id][$target] = $entry[$field];
                }
            }
        }
    }
}