undefinedStrings, is now returned that holds field values that are judged to be undefined strings.
i.e. they are a non-numeric value that is not defined in a @string{...} entry and not enclosed by braces or double-quotes.
This array will be empty unless the following condition is met:
($this->removeDelimit || $this->expandMacro && $this->fieldExtract)
24/04/2006 Esteban Zimanyi
- When an undefined string is found in function removeDelimiters return the empty string
- Return $this->undefinedStrings in the last position to allow compatibility with previous versions
- Fix management of preamble in function returnArrays
*/
// For a quick command-line test (php -f PARSEENTRIES.php) after installation, uncomment these lines:
require_once(drupal_get_path('module', 'biblio') . '/biblio.contributors.inc');
/*************************
// Parse a file
$parse = NEW PARSEENTRIES();
$parse->expandMacro = TRUE;
// $array = array("RMP" =>"Rev., Mod. Phys.");
// $parse->loadStringMacro($array);
// $parse->removeDelimit = FALSE;
// $parse->fieldExtract = FALSE;
$parse->openBib("bib.bib");
$parse->extractEntries();
$parse->closeBib();
list($preamble, $strings, $entries, $undefinedStrings) = $parse->returnArrays();
print_r($preamble);
print "\n";
print_r($strings);
print "\n";
print_r($entries);
print "\n\n";
*************************/
/************************
// Parse a bibtex PHP string
$bibtex_data = <<< END
@STRING{three = "THREE"}
@STRING{two = "TWO"}
@string{JRNL23 = {NatLA 23 } # " " # two # " " # three}
@article{klitzing.1,
author = "v. Klitzing and Dorda and Pepper",
title = "New method for high mark@sirfragalot.com accuracy determination of fine structure constant based on quantized hall resistance",
volume = "45",
journal = {Journal of } # JRNL23,
pages = "494",
citeulike-article-id = {12222
}
,
ignoreMe = {blah}, }
@article
{
klitzing.2,
author = "Klaus von Klitzing",
title = "The Quantized Hall Effect",
volume = "58",
journal = two,
pages = "519",
}
END;
$parse = NEW PARSEENTRIES();
$parse->expandMacro = TRUE;
// $parse->removeDelimit = FALSE;
// $parse->fieldExtract = FALSE;
$array = array("RMP" =>"Rev., Mod. Phys.");
$parse->loadStringMacro($array);
$parse->loadBibtexString($bibtex_data);
$parse->extractEntries();
list($preamble, $strings, $entries, $undefinedStrings) = $parse->returnArrays();
print_r($preamble);
print "\n";
print_r($strings);
print "\n";
print_r($entries);
print "\n\n";
**********************/
class PARSEENTRIES
{
/**
 * Create a parser with empty result arrays and the default option flags.
 *
 * Defaults: fieldExtract and removeDelimit are on, expandMacro is off and
 * the parser starts in file mode (parseFile = TRUE).
 *
 * The LaTeX-to-Unicode table read by loadBibtexString() is stored in
 * $this->transtab_latex_unicode here. The include is assumed to define a
 * variable named $transtab_latex_unicode (TODO confirm against
 * transtab_latex_unicode.inc.php). Because require_once only executes the
 * file once per request, the table is cached in a static so that later
 * instances receive it as well.
 */
function __construct()
{
  static $latexTable = NULL;

  require_once(drupal_get_path('module', 'biblio') . '/bibtexParse/transtab_latex_unicode.inc.php');
  if ($latexTable === NULL) {
    if (isset($transtab_latex_unicode) && is_array($transtab_latex_unicode)) {
      // First inclusion happened right here: the variable is local to this method.
      $latexTable = $transtab_latex_unicode;
    }
    elseif (isset($GLOBALS['transtab_latex_unicode']) && is_array($GLOBALS['transtab_latex_unicode'])) {
      // The file was already included from the global scope.
      $latexTable = $GLOBALS['transtab_latex_unicode'];
    }
  }
  // Fall back to an empty table so loadBibtexString() always has something iterable.
  $this->transtab_latex_unicode = is_array($latexTable) ? $latexTable : array();

  $this->preamble = $this->strings = $this->undefinedStrings = $this->entries = array();
  $this->count = 0;
  $this->fieldExtract = TRUE;
  $this->removeDelimit = TRUE;
  $this->expandMacro = FALSE;
  $this->parseFile = TRUE;
  $this->outsideEntry = TRUE;
}

/**
 * PHP 4 style constructor, kept so existing code that calls it explicitly
 * keeps working. PHP 8 no longer invokes a method named after the class on
 * `new`, which is why the initialisation lives in __construct().
 */
function PARSEENTRIES()
{
  $this->__construct();
}
// Open bib file
/**
 * Open a BibTeX file for reading and switch the parser to file mode.
 *
 * The script is terminated when $file is not a regular file.
 *
 * @param string $file Path of the file to open.
 */
function openBib($file)
{
  if (is_file($file) === FALSE) {
    exit;
  }
  $handle = fopen($file, 'r');
  $this->fid = $handle;
  $this->parseFile = TRUE;
}
// Load a bibtex string to parse it
/**
 * Use in-memory BibTeX data as the parser input instead of a file.
 *
 * A string is run through the replacement table held in
 * $this->transtab_latex_unicode and then split on "\n"; any other value is
 * stored untouched (getLine() indexes it as a list of lines). Either way the
 * parser leaves file mode and the read position restarts at the first line.
 *
 * @param mixed $bibtex_string BibTeX text, or an already split list of lines.
 */
function loadBibtexString($bibtex_string)
{
  $this->parseFile = FALSE;
  $this->currentLine = 0;

  if (!is_string($bibtex_string)) {
    $this->bibtexString = $bibtex_string;
    return;
  }

  $converted = $this->searchReplaceText($this->transtab_latex_unicode, $bibtex_string, false);
  $this->bibtexString = explode("\n", $converted);
}
/**
 * Apply a list of regular-expression replacements to a text, in order.
 *
 * Each key of $searchReplaceActionsArray is a PCRE pattern and each value
 * its replacement; later actions see the result of earlier ones.
 *
 * @param array  $searchReplaceActionsArray       pattern => replacement pairs.
 *        Anything that cannot be iterated leaves the text unchanged.
 * @param string $sourceString                    Text to transform.
 * @param bool   $includesSearchPatternDelimiters TRUE when the patterns already
 *        carry their own delimiters; otherwise each pattern is wrapped in "/".
 * @return string The transformed text.
 */
function searchReplaceText($searchReplaceActionsArray, $sourceString, $includesSearchPatternDelimiters=FALSE)
{
  if (!is_array($searchReplaceActionsArray) && !($searchReplaceActionsArray instanceof Traversable)) {
    return $sourceString;
  }

  foreach ($searchReplaceActionsArray as $searchString => $replaceString) {
    if (!$includesSearchPatternDelimiters) {
      $searchString = "/" . $searchString . "/"; // add search pattern delimiters
    }
    // preg_replace() already returns the subject untouched when nothing
    // matches, so no separate preg_match() pass is needed.
    $replaced = preg_replace($searchString, $replaceString, $sourceString);
    // NULL signals a PCRE error (bad pattern, backtrack limit): keep the
    // text gathered so far rather than discarding it.
    if ($replaced !== NULL) {
      $sourceString = $replaced;
    }
  }

  return $sourceString;
}
// Set strings macro
/**
 * Register caller-supplied string macros.
 *
 * returnArrays() seeds its macro table with this array before it processes
 * the @string entries found in the input.
 *
 * @param array $macro_array Macro name => replacement text.
 */
function loadStringMacro($macro_array)
{
  $userMacros = $macro_array;
  $this->userStrings = $userMacros;
}
// Close bib file
/**
 * Close the file opened by openBib().
 *
 * Does nothing when no file is open (openBib() was never called, or the
 * handle has already been closed), so it is safe to call unconditionally.
 */
function closeBib()
{
  // is_resource() is FALSE for a missing handle and for one that has
  // already been closed.
  if (isset($this->fid) && is_resource($this->fid)) {
    fclose($this->fid);
  }
}
// Get a non-empty line from the bib file or from the bibtexString
/**
 * Return the next non-empty, trimmed line of the current input.
 *
 * In file mode lines come from the handle opened by openBib(); otherwise
 * they come from the list stored by loadBibtexString(). Lines that are
 * falsy after trimming (empty, or the single character "0") are skipped,
 * because callers loop with `while ($line = $this->getLine())`.
 *
 * @return string|false The next line. At the end of the input the result is
 *         falsy: FALSE, or "" when a file ends with blank lines.
 */
function getLine()
{
  if ($this->parseFile) {
    if (feof($this->fid)) {
      return FALSE;
    }
    do {
      // fgets() yields FALSE at end of file; the cast turns that into "".
      $line = trim((string) fgets($this->fid));
    }
    while (!feof($this->fid) && !$line);
    return $line;
  }

  // String mode: never index past the last stored line.
  $total = is_array($this->bibtexString) ? count($this->bibtexString) : 0;
  while ($this->currentLine < $total) {
    $line = trim((string) $this->bibtexString[$this->currentLine]);
    $this->currentLine++;
    if ($line) {
      return $line;
    }
  }
  return FALSE;
}
// Extract value part of @string field enclosed by double-quotes or braces.
// The string may be expanded with previously-defined strings
/**
 * Turn the right-hand side of an @string definition into its final value.
 *
 * @param string $string Text after the "=", still ending with the entry's
 *        closing delimiter.
 * @return string The value with that last character dropped, its own
 *         delimiters removed and macros expanded as done by
 *         removeDelimitersAndExpand().
 */
function extractStringValue($string)
{
  // The final character is the end delimiter of the @string entry.
  $withoutEndDelimiter = substr($string, 0, strlen($string) - 1);
  return $this->removeDelimitersAndExpand(trim($withoutEndDelimiter));
}
// Extract a field
/**
 * Cut the first field value off a run of "value, key = value, ..." text.
 *
 * The text is split once at the first ", key =" separator, which also
 * handles fields like another-field = {}.
 *
 * @param string $seg Text starting with a field value.
 * @return array Two elements: the first value, and the text after the
 *         separator (starting at the next value), or FALSE when no further
 *         field follows.
 */
function fieldSplit($seg)
{
  // Explicit limit of 2: reduceFields() consumes one value per call. The
  // separator itself is deliberately not captured.
  $array = preg_split("/,\s*([-_.:,a-zA-Z0-9]+)\s*={1}\s*/U", $seg, 2);
  if (!is_array($array)) {
    // PCRE failure: treat the whole segment as the last value.
    return array($seg, FALSE);
  }
  if (!array_key_exists(1, $array)) {
    return array($array[0], FALSE);
  }
  return array($array[0], $array[1]);
}
// Extract and format fields
/**
 * Split the field part of an entry into key/value pairs and store them in
 * $this->entries[$this->count].
 *
 * The values are collected first (one fieldSplit() call each) and cut out
 * of the text; what remains is a "key=,key=," skeleton from which the keys
 * are read in the same order. Keys are lower-cased; values keep their
 * delimiters because returnArrays() still needs them to recognise macros.
 *
 * @param string $oldString Entry text after the citation key, normally
 *        ending with the entry's closing delimiter.
 */
function reduceFields($oldString)
{
  $oldString = (string) $oldString;
  $lg = strlen($oldString);
  if ($lg === 0) {
    return;
  }
  // 03/05/2005 G. Gardey. Remove only ONE trailing character, not all of them,
  // so an entry ending in "somefield = {aValue}}" is parsed correctly.
  $last = $oldString[$lg - 1];
  if ($last == "}" || $last == ")" || $last == ",") {
    $oldString = substr($oldString, 0, $lg - 1);
  }

  $split = preg_split("/=/", $oldString, 2);
  $string = isset($split[1]) ? $split[1] : '';
  $values = array();
  // Compare against '' explicitly so a remaining bare "0" is still consumed.
  while (is_string($string) && $string !== '') {
    list($entry, $string) = $this->fieldSplit($string);
    $values[] = $entry;
  }

  // Cut every value out of the text, leaving only the keys and separators.
  foreach ($values as $value) {
    if ($value === '') {
      continue;
    }
    $pos = strpos($oldString, $value);
    if ($pos === FALSE) {
      // Not found: do not let FALSE be used as offset 0.
      continue;
    }
    $oldString = substr_replace($oldString, '', $pos, strlen($value));
  }

  if (substr(trim($oldString), -1) !== ',') {
    $oldString .= ',';
  }
  $keys = preg_split("/=,/", $oldString);
  // 22/08/2004 - Mark Grimshaw
  // The split always leaves a trailing element that is not a key; drop it.
  array_pop($keys);

  foreach ($keys as $key) {
    // Also removes any dangling ',' left on the final field of the entry.
    $value = rtrim(trim((string) array_shift($values)), ',');
    if ($value === '') {
      continue;
    }
    // 21/08/2004 G.Gardey -> expand macro
    // Delimiters are not removed here: it must still be possible to tell
    // whether the value is a string macro.
    $key = strtolower(trim($key));
    $value = trim($value);
    $this->entries[$this->count][$key] = $value;
  }
}
// Start splitting a bibtex entry into component fields.
// Store the entry type and citation.
/**
 * Store the type and citation key of an entry, then parse its fields.
 *
 * Writes 'bibtexEntryType' (lower-cased) and 'bibtexCitation' (trimmed)
 * into $this->entries[$this->count] and passes the rest of the entry to
 * reduceFields().
 *
 * @param string $entry One complete entry, from '@' to its closing delimiter.
 */
function fullSplit($entry)
{
  $matches = preg_split("/@(.*)[{(](.*),/U", $entry, 2, PREG_SPLIT_DELIM_CAPTURE);
  $type = isset($matches[1]) ? $matches[1] : '';
  $this->entries[$this->count]['bibtexEntryType'] = strtolower(trim($type));

  // Sometimes an entry has no citation key: what was captured as the key
  // then contains '=' and is really the first field, so split again
  // without consuming it.
  if (isset($matches[2]) && preg_match("/=/", $matches[2])) {
    $matches = preg_split("/@(.*)\s*[{(](.*)/U", $entry, 2, PREG_SPLIT_DELIM_CAPTURE);
  }

  // Whitespace around the key (e.g. "@article { key,") is not part of it.
  $this->entries[$this->count]['bibtexCitation'] = isset($matches[2]) ? trim($matches[2]) : '';
  $this->reduceFields(isset($matches[3]) ? $matches[3] : '');
}
// Grab a complete bibtex entry
/**
 * File one complete entry under strings, preamble or entries.
 *
 * The entry type is the text between '@' and the first '{' or '('. Types
 * containing "string" or "preamble" are stored raw for returnArrays();
 * types containing "comment" are ignored; anything else becomes a regular
 * entry and advances $this->count.
 *
 * @param string $entry One complete entry.
 * @return bool|null FALSE when an entry header was recognised, NULL otherwise.
 */
function parseEntry($entry)
{
  $lastLine = FALSE;

  if (!preg_match("/@(.*)([{(])/U", preg_quote($entry), $matches)) {
    return;
  }
  if (!array_key_exists(1, $matches)) {
    return $lastLine;
  }

  $type = $matches[1];
  if (preg_match("/string/i", trim($type))) {
    $this->strings[] = $entry;
  }
  elseif (preg_match("/preamble/i", trim($type))) {
    $this->preamble[] = $entry;
  }
  elseif (!preg_match("/comment/i", $type)) { // MG (31/Jan/2006) -- ignore @comment
    if ($this->fieldExtract) {
      $this->fullSplit($entry);
    }
    else {
      $this->entries[$this->count] = $entry;
    }
    $this->count++;
  }

  return $lastLine;
}
// Remove delimiters from a string
/**
 * Strip the enclosing double-quotes or braces from a field value.
 *
 * - A value starting with '"' loses its first and last character.
 * - A value starting with '{' loses both braces only if it also ends with
 *   '}'; otherwise it is returned unchanged.
 * - A numeric value, or the name of a known macro, is returned unchanged.
 * - Anything else is an undefined string: it is recorded once in
 *   $this->undefinedStrings and '' is returned, on every occurrence.
 *
 * @param string $string Raw field value.
 * @return string The value without delimiters, or '' for an empty or
 *         undefined value.
 */
function removeDelimiters($string)
{
  if ($string === NULL || $string === FALSE || $string === '') {
    // Nothing to strip, and an empty value is not an undefined macro.
    return '';
  }
  $string = (string) $string;
  $first = $string[0];

  if ($first == "\"") {
    return (string) substr($string, 1, -1);
  }
  if ($first == "{") {
    if (substr($string, -1) == "}") {
      return (string) substr($string, 1, -1);
    }
    return $string;
  }
  if (is_numeric($string)) {
    return $string; // a year etc.
  }
  // Macro names are case insensitive; user-supplied macros keep their
  // original spelling, so check both forms.
  if (array_key_exists($string, $this->strings)
    || array_key_exists(strtolower($string), $this->strings)) {
    return $string;
  }

  if (!in_array($string, $this->undefinedStrings, TRUE)) {
    $this->undefinedStrings[] = $string;
  }
  return '';
}
// This function works like explode('#',$val) but has to take into account whether
// the character # is part of a string (i.e., is enclosed into "..." or {...} )
// or defines a string concatenation as in @string{ "x # x" # ss # {xx{x}x} }
/**
 * Split a value on the '#' concatenation operator.
 *
 * A '#' counts as an operator only outside double-quotes and outside
 * braces. The pieces are returned as found, including surrounding spaces.
 *
 * @param string $val Right-hand side of a field or @string definition.
 * @return array The pieces, in order; always at least one element.
 */
function explodeString($val)
{
  $pieces = array();
  $inQuotes = FALSE;
  $depth = 0;
  $segmentStart = 0;
  $length = strlen($val);

  for ($pos = 0; $pos < $length; $pos++) {
    switch ($val[$pos]) {
      case '"':
        $inQuotes = !$inQuotes;
        break;
      case '{':
        $depth++;
        break;
      case '}':
        $depth--;
        break;
      case '#':
        if (!$inQuotes && $depth == 0) {
          $pieces[] = substr($val, $segmentStart, $pos - $segmentStart);
          $segmentStart = $pos + 1;
        }
        break;
    }
  }

  // Whatever follows the last operator (or the whole value) is the final piece.
  $pieces[] = substr($val, $segmentStart);
  return $pieces;
}
// This function receives a string and a closing delimiter '}' or ')'
// and looks for the position of the closing delimiter taking into
// account the following Bibtex rules:
// * Inside the braces, there can arbitrarily nested pairs of braces,
// but braces must also be balanced inside quotes!
// * Inside quotes, to place the " character it is not sufficient
// to simply escape with \": Quotes must be placed inside braces.
/**
 * Find the position of the delimiter that closes an entry.
 *
 * @param string $val        Entry text gathered so far.
 * @param string $delimitEnd Closing delimiter to look for: '}' or ')'.
 * @return int Offset of the first $delimitEnd found outside quotes with all
 *         braces balanced, or 0 when there is none yet. 0 therefore also
 *         means "not found"; callers treat the result as a truth value.
 */
function closingDelimiter($val,$delimitEnd)
{
  $openquote = FALSE;
  $bracelevel = 0;
  $length = strlen($val);

  for ($i = 0; $i < $length; $i++) {
    // a '"' found at brace level 0 defines a value such as "ss{\"o}ss"
    if ($val[$i] == '"' && !$bracelevel) {
      $openquote = !$openquote;
    }
    elseif ($val[$i] == '{') {
      $bracelevel++;
    }
    elseif ($val[$i] == '}') {
      $bracelevel--;
    }
    // Checked after the counters are updated, so the '}' that brings the
    // level back to zero is itself accepted as the closing delimiter.
    if ($val[$i] == $delimitEnd && !$openquote && !$bracelevel) {
      return $i;
    }
  }

  return 0;
}
// Remove enclosures around entry field values. Additionally, expand macros if flag set.
/**
 * Remove the enclosures around a field value and, when enabled, expand macros.
 *
 * Expansion happens only if expandMacro is set, at least one macro is
 * known and the value does not come from a preamble. The value is then
 * split on '#', and every piece is either replaced by the macro of that
 * name (case insensitive) or stripped of its delimiters. In all other cases
 * the value goes through removeDelimiters() as a whole.
 *
 * @param string $string     Raw field value.
 * @param bool   $inpreamble TRUE when the value belongs to an @preamble.
 * @return string The processed value.
 */
function removeDelimitersAndExpand($string, $inpreamble = FALSE)
{
  $shouldExpand = $this->expandMacro && !empty($this->strings) && !$inpreamble;
  if (!$shouldExpand) {
    return $this->removeDelimiters($string);
  }

  $expanded = "";
  foreach ($this->explodeString($string) as $piece) {
    // '#' is usually surrounded by spaces
    $piece = trim($piece);
    // macros are case insensitive
    $macroName = strtolower($piece);
    $expanded .= isset($this->strings[$macroName])
      ? $this->strings[$macroName]
      : $this->removeDelimiters($piece);
  }
  return $expanded;
}
// This function extract entries taking into account how comments are defined in BibTeX.
// BibTeX splits the file in two areas: inside an entry and outside an entry, the delimitation
// being indicated by the presence of a @ sign. When this character is met, BibTex expects to
// find an entry. Before that sign, and after an entry, everything is considered a comment!
/**
 * Read all remaining lines from getLine() and pass every complete entry to
 * parseEntry().
 *
 * State kept across lines:
 *  - $inside: TRUE while the text of an entry is being accumulated.
 *  - $possibleEntryStart: text from an '@' onwards that has no opening
 *    delimiter yet; it is prepended to the next line.
 *  - $entry: text accumulated for the current entry.
 *  - $delimitEnd: closing delimiter matching the entry's opening one.
 *
 * NOTE(review): $delimitEnd is only assigned when an entry header is
 * detected while $inside is FALSE. An entry that starts in the leftover of
 * the previous entry's last line keeps the previous delimiter - confirm
 * this is acceptable for inputs mixing '{' and '(' entries.
 * NOTE(review): leftover text still held in $entry when the input ends is
 * never passed to parseEntry() - confirm whether an entry sharing its final
 * line with the previous one can be lost.
 */
function extractEntries()
{
$inside = $possibleEntryStart = FALSE;
$entry="";
while($line=$this->getLine())
{
// a previous line held an '@' but no opening delimiter: glue it to this line
if($possibleEntryStart)
$line = $possibleEntryStart . $line;
if (!$inside && strchr($line,"@"))
{
// throw all characters before the '@'
$line=strstr($line,'@');
// no '{' or '(' yet: remember the text and wait for the next line
if(!strchr($line, "{") && !strchr($line, "("))
$possibleEntryStart = $line;
elseif(preg_match("/@.*([{(])/U", preg_quote($line), $matches))
{
$inside = TRUE;
// choose the closing delimiter that pairs with the opening one
if ($matches[1] == '{')
$delimitEnd = '}';
else
$delimitEnd = ')';
$possibleEntryStart = FALSE;
}
}
if ($inside)
{
$entry .= " ".$line;
// closingDelimiter() returns 0 while the entry is still incomplete
if ($j=$this->closingDelimiter($entry,$delimitEnd))
{
// all characters after the delimiter are thrown but the remaining
// characters must be kept since they may start the next entry !!!
$lastLine = substr($entry,$j+1);
$entry = substr($entry,0,$j+1);
// Strip excess whitespaces from the entry
$entry = preg_replace('/\s\s+/', ' ', $entry);
$this->parseEntry($entry);
// keep the leftover from its first '@' on; stay inside only if there is one
$entry = strchr($lastLine,"@");
if ($entry)
$inside = TRUE;
else
$inside = FALSE;
}
}
}
}
// Return arrays of entries etc. to the calling process.
/**
 * Post-process everything collected by extractEntries() and return it.
 *
 * - Preambles: each raw @preamble is reduced to its content and stored
 *   under the key 'bibtexPreamble' (a later preamble replaces an earlier one).
 * - Strings (only with fieldExtract): the macro table is rebuilt from the
 *   user macros given to loadStringMacro() plus every @string entry, in
 *   input order, so a macro may use macros defined before it.
 * - Entries (only with fieldExtract, and removeDelimit or expandMacro):
 *   every field except bibtexCitation and bibtexEntryType is passed through
 *   removeDelimitersAndExpand().
 *
 * NOTE: this rewrites $this->preamble, $this->strings and $this->entries
 * in place. Calling it again re-processes values whose delimiters are
 * already gone, and removeDelimiters() then treats unquoted text as an
 * undefined string - so call it once, after all extractEntries() calls.
 *
 * @return array array($preamble, $strings, $entries, $undefinedStrings);
 *         undefinedStrings is last for compatibility with older callers.
 */
function returnArrays()
{
  foreach ($this->preamble as $value) {
    if (!preg_match("/.*?[{(](.*)/", $value, $matches)) {
      continue;
    }
    // drop the closing delimiter of the @preamble entry
    $preamble = substr($matches[1], 0, -1);
    $preambles['bibtexPreamble'] = trim($this->removeDelimitersAndExpand(trim($preamble), TRUE));
  }
  if (isset($preambles)) {
    $this->preamble = $preambles;
  }

  if ($this->fieldExtract) {
    // Next lines must take into account strings defined by previously-defined strings
    $strings = $this->strings;
    // $this->strings is initialized with strings provided by user if they exists
    // it is supposed that there are no substitutions to be made in the user strings, i.e., no #
    $this->strings = isset($this->userStrings) ? $this->userStrings : array();
    foreach ($strings as $value) {
      // changed 21/08/2004 G. Gardey
      // 23/08/2004 Mark G. account for comments on same line as @string - count delimiters in string value
      $value = trim($value);
      $matches = preg_split("/@\s*string\s*([{(])/i", $value, 2, PREG_SPLIT_DELIM_CAPTURE);
      if (!isset($matches[2])) {
        continue; // not an @string header
      }
      $parts = preg_split("/=/", $matches[2], 2, PREG_SPLIT_DELIM_CAPTURE);
      if (!isset($parts[1])) {
        continue; // no "name = value" inside
      }
      // macros are case insensitive
      $this->strings[strtolower(trim($parts[0]))] = $this->extractStringValue($parts[1]);
    }
  }

  // changed 21/08/2004 G. Gardey
  // 22/08/2004 Mark Grimshaw - stopped useless looping.
  // removeDelimit and expandMacro have NO effect if !$this->fieldExtract
  // (parenthesised: '&&' binds tighter than '||').
  if (($this->removeDelimit || $this->expandMacro) && $this->fieldExtract) {
    foreach ($this->entries as $i => $fields) {
      if (!is_array($fields)) {
        continue;
      }
      foreach ($fields as $key => $value) {
        // 02/05/2005 G. Gardey don't expand macro for bibtexCitation
        // and bibtexEntryType
        if ($key != 'bibtexCitation' && $key != 'bibtexEntryType') {
          $this->entries[$i][$key] = trim($this->removeDelimitersAndExpand($value));
        }
      }
    }
  }

  // EZ: empty arrays are returned as arrays (not FALSE) so that the same
  // instance can be used for parsing several files, e.g. an entry file
  // together with its abbreviation file.
  return array($this->preamble, $this->strings, $this->entries, $this->undefinedStrings);
}
/**
 * Convert every parsed entry into a node array appended to $node_array.
 *
 * Each new element starts as a copy of $node and receives:
 *  - 'biblio_type': numeric code looked up from the BibTeX entry type
 *    (left unset for unknown types);
 *  - 'biblio_contributors': one array('name' => ..., 'ctid' => ...) per
 *    name, split on " and " / " & "; ctid is 1 for authors, 2 for editors;
 *  - 'biblio_citekey' and the remaining biblio_* fields, copied from the
 *    entry when non-empty. Where several BibTeX fields feed the same node
 *    key, the one listed last wins.
 *
 * This calls returnArrays(), which post-processes the parser state.
 *
 * @param array $node_array List the new nodes are appended to (by reference).
 * @param array $node       Template copied into every new node.
 */
function bib2node(&$node_array, $node){
  // BibTeX entry type => biblio_type code.
  $typeMap = array(
    'article' => 102,
    'book' => 100,
    'booklet' => 101,
    'inbook' => 101,
    'conference' => 103,
    'incollection' => 100,
    'inproceedings' => 103,
    'manual' => 129,
    'mastersthesis' => 108,
    'misc' => 129,
    'phdthesis' => 108,
    'proceedings' => 104,
    'techreport' => 109,
    'unpublished' => 124,
  );
  // BibTeX field => node key. Order matters: a later non-empty field
  // overwrites an earlier one mapped to the same node key.
  $fieldMap = array(
    'journal' => 'biblio_secondary_title',
    'booktitle' => 'biblio_secondary_title',
    'series' => 'biblio_secondary_title',
    'volume' => 'biblio_volume',
    'number' => 'biblio_number',
    'year' => 'biblio_year',
    'note' => 'biblio_notes',
    'month' => 'biblio_date',
    'pages' => 'biblio_pages',
    'publisher' => 'biblio_publisher',
    'organization' => 'biblio_publisher',
    'school' => 'biblio_publisher',
    'institution' => 'biblio_publisher',
    'title' => 'title',
    'type' => 'biblio_type_of_work',
    'edition' => 'biblio_edition',
    'chapter' => 'biblio_section',
    'address' => 'biblio_place_published',
    'abstract' => 'biblio_abst_e',
    'keywords' => 'biblio_keywords',
    'isbn' => 'biblio_isbn',
    'url' => 'biblio_url',
  );
  // Contributor field => ctid, authors first.
  $contributorRoles = array('author' => 1, 'editor' => 2);

  list($preamble, $strings, $entries, $undefinedStrings) = $this->returnArrays();

  foreach ($entries as $entry) {
    // Append, then ask the array for the key actually used: count - 1 is
    // only right when $node_array has no gaps or string keys.
    $node_array[] = $node;
    end($node_array);
    $node_id = key($node_array);

    $node_array[$node_id]['biblio_contributors'] = array();

    if (isset($entry['bibtexEntryType']) && isset($typeMap[$entry['bibtexEntryType']])) {
      $node_array[$node_id]['biblio_type'] = $typeMap[$entry['bibtexEntryType']];
    }

    foreach ($contributorRoles as $field => $ctid) {
      if (empty($entry[$field])) {
        continue;
      }
      // split on ' and ' / ' & '
      $names = preg_split("/\s(and|&)\s/i", trim($entry[$field]));
      foreach ($names as $name) {
        $node_array[$node_id]['biblio_contributors'][] = array('name' => $name, 'ctid' => $ctid);
      }
    }

    if (!empty($entry['bibtexCitation'])) {
      $node_array[$node_id]['biblio_citekey'] = $entry['bibtexCitation'];
    }

    foreach ($fieldMap as $field => $nodeKey) {
      if (!empty($entry[$field])) {
        $node_array[$node_id][$nodeKey] = $entry[$field];
      }
    }
  }
}
}