url);
  if ($downloaded_string == FALSE) {
    return FALSE;
  }
  if (!defined('LIBXML_VERSION') || (version_compare(phpversion(), '5.1.0', '<'))) {
    @ $xml = simplexml_load_string($downloaded_string, NULL);
  }
  else {
    @ $xml = simplexml_load_string($downloaded_string, NULL, LIBXML_NOERROR | LIBXML_NOWARNING);
  }
  // We got a malformed XML
  if ($xml === FALSE || $xml == NULL) {
    return FALSE;
  }
  $feed_type = _parser_common_syndication_feed_format_detect($xml);
  if ($feed_type == "atom1.0") {
    return _parser_common_syndication_atom10_parse($xml);
  }
  if ($feed_type == "RSS2.0" || $feed_type == "RSS0.91" || $feed_type == "RSS0.92") {
    return _parser_common_syndication_RSS20_parse($xml);
  }
  if ($feed_type == "RDF") {
    return _parser_common_syndication_RDF10_parse($xml);
  }
  return FALSE;
}

/**
 * Implementation of hook_feedapi_type().
 *
 * Declares the feed types that this module is able to handle.
 *
 * @return
 *   An array of feed type names.
 */
function parser_common_syndication_feedapi_type() {
  return array("XML feed");
}

/**
 * Determine the feed format of a SimpleXML parsed object structure.
 *
 * The decision is based on the (lower-cased) name of the root element, the
 * root's "version" attribute and the presence of <entry>/<channel> children.
 *
 * @param $xml
 *   SimpleXML-preprocessed feed.
 * @return
 *   One of "atom1.0", "RSS2.0", "RSS0.91", "RSS0.92" or "RDF"; FALSE when
 *   $xml is not an object or the format is not recognised.
 */
function _parser_common_syndication_feed_format_detect($xml) {
  if (!is_object($xml)) {
    return FALSE;
  }
  $root = strtolower($xml->getName());
  // Empty string when the root element carries no version attribute.
  $version = (string) $xml["version"];

  if ($root == "feed" && isset($xml->entry)) {
    return "atom1.0";
  }
  if ($root == "rdf" && isset($xml->channel)) {
    return "RDF";
  }
  if ($root == "rss") {
    // Loose comparison on purpose: numeric strings such as "2" still match
    // "2.0", exactly as they did before.
    if ($version == "2.0") {
      return "RSS2.0";
    }
    if ($version == "0.91") {
      return "RSS0.91";
    }
    if ($version == "0.92") {
      return "RSS0.92";
    }
  }
  return FALSE;
}

/**
 * Call one of the possible feedapi_get hooks and pass back the downloaded data.
 *
 * The first module implementing hook_feedapi_get() whose hook_feedapi_type()
 * result shares at least one type with this parser is asked to fetch the URL.
 *
 * @param $url
 *   The URL of the feed.
 * @return
 *   string - the downloaded data, FALSE - if no suitable downloader exists or
 *   the downloader returned nothing.
 */
function _parser_common_syndication_download($url) {
  $this_types = parser_common_syndication_feedapi_type();

  foreach (module_implements("feedapi_get") as $concrete_module) {
    $types = module_invoke($concrete_module, "feedapi_type");
    // A downloader without a feedapi_type hook yields a non-array here;
    // skip it instead of feeding it to array_intersect().
    if (!is_array($types) || count(array_intersect($this_types, $types)) == 0) {
      continue;
    }
    $downloaded_string = module_invoke($concrete_module, "feedapi_get", $url);
    // Cannot get the feed, pass the problem one level up.
    if ($downloaded_string == "") {
      return FALSE;
    }
    return $downloaded_string;
  }

  return FALSE;
}

/**
 * Flatten an Atom text element (content, summary) into a string.
 *
 * Child elements are serialised as XML and the element's own character data
 * is appended afterwards.
 *
 * @param $element
 *   An existing SimpleXML element.
 * @return
 *   The flattened markup/text.
 */
function _parser_common_syndication_atom10_text($element) {
  $text = '';
  foreach ($element->children() as $child) {
    $text .= $child->asXML();
  }
  return $text . (string) $element;
}

/**
 * Pick the most useful href among the <link> children of an Atom element.
 *
 * A link whose rel is "alternate" (or missing, which means the same thing in
 * Atom) wins; otherwise the first link that has an href at all is used, which
 * keeps the result identical for elements with a single link.
 *
 * @param $element
 *   SimpleXML element of the feed or of one entry.
 * @return
 *   The href as a string, or "" when no link carries one.
 */
function _parser_common_syndication_atom10_link($element) {
  $fallback = "";
  foreach ($element->link as $link) {
    $href = isset($link["href"]) ? (string) $link["href"] : "";
    if ($href === "") {
      continue;
    }
    $rel = isset($link["rel"]) ? strtolower((string) $link["rel"]) : "";
    if ($rel === "" || $rel === "alternate") {
      return $href;
    }
    if ($fallback === "") {
      $fallback = $href;
    }
  }
  return $fallback;
}

/**
 * Parse atom feeds.
 *
 * @param $feed_XML
 *   SimpleXML root element of an Atom 1.0 feed.
 * @return
 *   An object with title, description, options->link and an items array. Each
 *   item has title, description and options (teaser, original_author,
 *   timestamp, original_url, guid, tags). tags is always an array.
 */
function _parser_common_syndication_atom10_parse($feed_XML) {
  $parsed_source = new stdClass();
  // Detect the title
  $parsed_source->title = isset($feed_XML->title) ? (string) $feed_XML->title : "";
  // Detect the description
  $parsed_source->description = isset($feed_XML->subtitle) ? (string) $feed_XML->subtitle : "";
  $parsed_source->options = new stdClass();
  // Detect the link
  $parsed_source->options->link = _parser_common_syndication_atom10_link($feed_XML);
  $parsed_source->items = array();

  // Author used for entries that do not name one themselves.
  $feed_author = isset($feed_XML->author->name) ? (string) $feed_XML->author->name : '';

  foreach ($feed_XML->entry as $news) {
    // Everything is reset per entry so nothing leaks over from the previous one.
    $guid = NULL;
    $original_url = NULL;

    $id = (string) $news->id;
    if ($id !== '') {
      $guid = $id;
      // I don't know how standard this is, but sometimes the id is the URL
      if (valid_url($guid, TRUE)) {
        $original_url = $guid;
      }
    }

    $categories = array();
    foreach ($news->category as $category) {
      $term = (string) $category['term'];
      if ($term !== '') {
        $categories[] = $term;
      }
    }

    $title = (string) $news->title;

    // Prefer the content, fall back to the summary when there is no content
    // or the content is empty (e.g. out-of-line content given by src only).
    $body = '';
    if (isset($news->content)) {
      $body = _parser_common_syndication_atom10_text($news->content);
    }
    if ($body === '' && isset($news->summary)) {
      $body = _parser_common_syndication_atom10_text($news->summary);
    }

    if (isset($news->content)) {
      $src = (string) $news->content['src'];
      // some src elements in some valid atom feeds contained no urls at all
      if ($src !== '' && valid_url($src, TRUE)) {
        $original_url = $src;
      }
    }

    if (isset($news->summary)) {
      $teaser = _parser_common_syndication_atom10_text($news->summary);
    }
    else {
      $teaser = node_teaser($body);
    }

    // Author precedence: entry source, entry, feed.
    $original_author = '';
    if (isset($news->source->author->name)) {
      $original_author = (string) $news->source->author->name;
    }
    if ($original_author === '' && isset($news->author->name)) {
      $original_author = (string) $news->author->name;
    }
    if ($original_author === '') {
      $original_author = $feed_author;
    }

    $href = _parser_common_syndication_atom10_link($news);
    if ($href !== '' && valid_url($href, TRUE)) {
      $original_url = $href;
    }

    // published is optional in Atom while updated is mandatory, so try both
    // before giving up and using the current time.
    $timestamp = strtotime((string) $news->published);
    if ($timestamp === FALSE) {
      $timestamp = strtotime((string) $news->updated);
    }
    if ($timestamp === FALSE) {
      $timestamp = time();
    }

    $item = new stdClass();
    $item->title = $title;
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->teaser = $teaser;
    $item->options->original_author = $original_author;
    $item->options->timestamp = $timestamp;
    $item->options->original_url = $original_url;
    $item->options->guid = $guid;
    $item->options->tags = $categories;
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}

/**
 * Parse RSS1.0/RDF feeds.
 *
 * @param $feed_XML
 *   SimpleXML root element of an RDF feed.
 * @return
 *   An object with title, description, options->link and an items array. Each
 *   item has title, description and options (teaser, original_author,
 *   timestamp, original_url, guid, tags). tags is always an array.
 */
function _parser_common_syndication_RDF10_parse($feed_XML) {
  $parsed_source = new stdClass();
  // Detect the title
  $parsed_source->title = isset($feed_XML->channel->title) ? (string) $feed_XML->channel->title : "";
  // Detect the description
  $parsed_source->description = isset($feed_XML->channel->description) ? (string) $feed_XML->channel->description : "";
  $parsed_source->options = new stdClass();
  // Detect the link
  $parsed_source->options->link = isset($feed_XML->channel->link) ? (string) $feed_XML->channel->link : "";
  $parsed_source->items = array();

  // set category splitter (space is for del.icio.us feed)
  $category_splitter = ' ';

  // The channel title is the default original author.
  $default_author = isset($feed_XML->channel->title) ? (string) $feed_XML->channel->title : '';

  // versions prior 5.1.2 don't allow namespaces
  $legacy_php = version_compare(phpversion(), '5.1.2', '<');
  if ($legacy_php) {
    $namespaces = array('default' => NULL);
  }
  else {
    $namespaces = $feed_XML->getNamespaces(TRUE);
  }

  foreach ($feed_XML->item as $news) {
    // initialization
    $guid = $original_url = NULL;
    $title = $body = '';
    $timestamp = time();
    $categories = array();
    $original_author = $default_author;

    foreach ($namespaces as $ns_link) {
      // get about attribute as guid (stored as a plain string)
      foreach ($news->attributes($ns_link) as $name => $value) {
        if ($name == 'about') {
          $guid = (string) $value;
        }
      }

      // get children for current namespace
      $children = $legacy_php ? $news : $news->children($ns_link);

      // title
      $value = (string) $children->title;
      if ($value !== '') {
        $title = $value;
      }
      // description or dc:description, only while no body has been found yet
      $value = (string) $children->description;
      if ($value !== '' && $body === '') {
        $body = $value;
      }
      // link
      $value = (string) $children->link;
      if ($value !== '') {
        $original_url = $value;
      }
      // dc:creator
      $value = (string) $children->creator;
      if ($value !== '') {
        $original_author = $value;
      }
      // dc:date; an unparsable date keeps the current-time default
      $value = (string) $children->date;
      if ($value !== '') {
        $parsed_date = strtotime($value);
        if ($parsed_date !== FALSE) {
          $timestamp = $parsed_date;
        }
      }
      // content:encoded always wins over a plain description
      $value = (string) $children->encoded;
      if ($value !== '') {
        $body = $value;
      }
      // dc:subject
      $subject_count = count($children->subject);
      if ($subject_count > 1) {
        // there can be multiple category tags
        foreach ($children->subject as $cat) {
          $label = trim((string) $cat);
          if ($label === '') {
            // Subject wrapped in nested markup: use its text content.
            $label = trim(strip_tags($cat->asXML()));
          }
          if ($label !== '') {
            $categories[] = $label;
          }
        }
      }
      elseif ($subject_count == 1 && (string) $children->subject !== '') {
        // or single tag
        $categories = explode($category_splitter, (string) $children->subject);
      }
    }

    // description is not mandatory so use title if description not present
    if (!$body) {
      $body = $title;
    }
    // make teaser
    $teaser = node_teaser($body);
    // if there are no link tag but rdf:about is provided
    if (!$original_url && $guid) {
      $original_url = $guid;
    }

    $item = new stdClass();
    $item->title = $title;
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->teaser = $teaser;
    $item->options->original_author = $original_author;
    $item->options->timestamp = $timestamp;
    $item->options->original_url = $original_url;
    $item->options->guid = $guid;
    // Categories belong in tags, like in the Atom and RSS parsers.
    $item->options->tags = $categories;
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}

/**
 * Parse RSS2.0 feeds (also used for RSS 0.91 and 0.92).
 *
 * @param $feed_XML
 *   SimpleXML root element of an RSS feed.
 * @return
 *   An object with title, description, options->link and an items array. Each
 *   item has title, description and options (teaser, original_author,
 *   timestamp, original_url, guid, tags). tags is always an array.
 */
function _parser_common_syndication_RSS20_parse($feed_XML) {
  $parsed_source = new stdClass();
  // Detect the title
  $parsed_source->title = isset($feed_XML->channel->title) ? (string) $feed_XML->channel->title : "";
  // Detect the description
  $parsed_source->description = isset($feed_XML->channel->description) ? (string) $feed_XML->channel->description : "";
  $parsed_source->options = new stdClass();
  // Detect the link
  $parsed_source->options->link = isset($feed_XML->channel->link) ? (string) $feed_XML->channel->link : "";
  $parsed_source->items = array();

  // The channel title doubles as the author of every item.
  $original_author = isset($feed_XML->channel->title) ? (string) $feed_XML->channel->title : '';

  $items = $feed_XML->xpath('//item');
  if (!is_array($items)) {
    // xpath() reports failure with FALSE.
    $items = array();
  }

  foreach ($items as $news) {
    // children() expects the namespace URI, not the "content" prefix.
    $content = $news->children('http://purl.org/rss/1.0/modules/content/');

    $guid = (string) $news->guid;
    if ($guid === '') {
      $guid = NULL;
    }

    $title = (string) $news->title;

    $body = (string) $news->description;
    // some sources use content:encoded as description i.e. PostNuke PageSetter module
    if ($body === '') {
      // content:encoded for PHP < 5.1.2
      $body = (string) $news->encoded;
    }
    if ($body === '') {
      // content:encoded for PHP >= 5.1.2
      $body = (string) $content->encoded;
    }
    if ($body === '') {
      $body = $title;
    }
    $teaser = node_teaser($body);

    $original_url = (string) $news->link;
    if ($original_url === '') {
      $original_url = NULL;
    }

    $timestamp = strtotime((string) $news->pubDate);
    if ($timestamp === FALSE) {
      $timestamp = time();
    }

    // Every <category> element is used; each may hold a "/"-separated path.
    $categories = array();
    foreach ($news->category as $category) {
      $category = (string) $category;
      if ($category !== '') {
        $categories = array_merge($categories, explode('/', $category));
      }
    }

    $item = new stdClass();
    $item->title = $title;
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->teaser = $teaser;
    $item->options->original_author = $original_author;
    $item->options->timestamp = $timestamp;
    $item->options->original_url = $original_url;
    $item->options->guid = $guid;
    $item->options->tags = $categories;
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}