url);
    if ($downloaded_string === FALSE || is_object($downloaded_string)) {
      return $downloaded_string;
    }
    if (!defined('LIBXML_VERSION') || (version_compare(phpversion(), '5.1.0', '<'))) {
      @ $xml = simplexml_load_string($downloaded_string, NULL);
    }
    else {
      @ $xml = simplexml_load_string($downloaded_string, NULL, LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NOCDATA);
    }
    // Got a malformed XML.
    if ($xml === FALSE || is_null($xml)) {
      return FALSE;
    }
  }
  $feed_type = _parser_common_syndication_feed_format_detect($xml);
  if ($feed_type == "atom1.0") {
    return _parser_common_syndication_atom10_parse($xml);
  }
  if ($feed_type == "RSS2.0" || $feed_type == "RSS0.91" || $feed_type == "RSS0.92") {
    return _parser_common_syndication_RSS20_parse($xml);
  }
  if ($feed_type == "RDF") {
    return _parser_common_syndication_RDF10_parse($xml);
  }
  return FALSE;
}

/**
 * Get the cached version of the $url.
 *
 * @param $url
 *   The feed URL. Its md5 hash is used as the cache file name.
 * @return
 *   The unserialized content of the cache file, or FALSE if there is no
 *   usable cache directory, no cache file, or the file cannot be read or
 *   unserialized.
 */
function _parser_common_syndication_cache_get($url) {
  $cache_dir = _parser_common_syndication_sanitize_cache();
  if ($cache_dir === FALSE) {
    // No usable cache directory; do not probe "/<md5>" in the filesystem root.
    return FALSE;
  }
  $cache_file = $cache_dir .'/'. md5($url);
  if (!file_exists($cache_file)) {
    return FALSE;
  }
  $file_content = file_get_contents($cache_file);
  if ($file_content === FALSE) {
    return FALSE;
  }
  // NOTE(review): unserialize() is applied to whatever is in the cache file.
  // This is only safe while nothing untrusted can write to the cache
  // directory -- confirm the directory permissions / web access rules.
  return unserialize($file_content);
}

/**
 * Store the parsed feed into the cache.
 *
 * Caching is best effort: if there is no usable cache directory or the file
 * cannot be written, the feed is simply not cached.
 *
 * @param $url
 *   The feed URL. Its md5 hash is used as the cache file name.
 * @param $parsed_feed
 *   The value to serialize into the cache file.
 */
function _parser_common_syndication_cache_set($url, $parsed_feed) {
  $cache_dir = _parser_common_syndication_sanitize_cache();
  if ($cache_dir === FALSE) {
    return;
  }
  // file_put_contents() truncates an existing file, like fopen(..., 'w') did,
  // and does not leave a dangling handle when the file cannot be opened.
  @file_put_contents($cache_dir .'/'. md5($url), serialize($parsed_feed));
}

/**
 * Get the content from the given URL.
 *
 * @param $url
 *   A valid URL (not only web URLs).
 * @param $username
 *   If the URL use authentication, here you can supply the username for this.
 * @param $password
 *   If the URL use authentication, here you can supply the password for this.
 * @param $accept_invalid_cert
 *   If TRUE and cURL is used, the peer's SSL certificate is not verified.
 * @return
 *   The data pulled from the URL, the cached parsed feed object (with
 *   from_cache set to TRUE) if the server answered 304, or FALSE if no data
 *   was received.
*/
function _parser_common_syndication_feedapi_get($url, $username = NULL, $password = NULL, $accept_invalid_cert = FALSE) {
  // Intra-page download cache: avoid downloading the same content twice
  // within one page request (the same URL can be requested more than once).
  static $download_cache = array();
  if (isset($download_cache[$url])) {
    return $download_cache[$url];
  }

  $has_etag = FALSE;
  $curl = _parser_common_syndication_use_curl();

  // Only download and parse data if it really needs a refresh, based on the
  // validators ("ETag" / "Last-Modified") stored from the previous download.
  // cURL wants a list of "Name: value" strings, drupal_http_request() wants
  // a name => value array.
  $headers = array();
  $db_result = db_query("SELECT etag, last_modified FROM {parser_common_syndication} WHERE url = '%s'", md5($url));
  while ($validate = db_fetch_array($db_result)) {
    $has_etag = TRUE;
    if (!empty($validate['etag'])) {
      if ($curl) {
        $headers[] = 'If-None-Match: '. $validate['etag'];
      }
      else {
        $headers['If-None-Match'] = $validate['etag'];
      }
    }
    if (!empty($validate['last_modified'])) {
      if ($curl) {
        $headers[] = 'If-Modified-Since: '. $validate['last_modified'];
      }
      else {
        $headers['If-Modified-Since'] = $validate['last_modified'];
      }
    }
  }

  // Credentials must be sent whether or not validators are stored for this
  // URL; otherwise the very first download of a protected feed would be
  // unauthenticated. (cURL gets the credentials through CURLOPT_USERPWD.)
  if (!empty($username) && !$curl) {
    $headers['Authorization'] = 'Basic '. base64_encode("$username:$password");
  }

  if ($curl) {
    $result = _parser_common_syndication_curl_get($url, $headers, $username, $password, $accept_invalid_cert);
    if ($result === FALSE) {
      // The transfer itself failed: keep the stored validators untouched and
      // do not retry within this page request.
      $download_cache[$url] = FALSE;
      return FALSE;
    }
  }
  else {
    $result = drupal_http_request($url, $headers);
  }

  $result->code = isset($result->code) ? $result->code : 200;

  // Not modified: return the cached, already parsed data.
  if ($result->code == 304) {
    $cached_data = _parser_common_syndication_cache_get($url);
    if (is_object($cached_data)) {
      $cached_data->from_cache = TRUE;
      return $cached_data;
    }
    // The cache file should exist and contain good data, but it does not.
    // Forget the validators and repeat the request unconditionally, with the
    // same options as the original call.
    db_query("DELETE FROM {parser_common_syndication} WHERE url = '%s'", md5($url));
    return _parser_common_syndication_feedapi_get($url, $username, $password, $accept_invalid_cert);
  }

  // Remember the validators of this response for the next conditional request.
  $response_headers = (isset($result->headers) && is_array($result->headers)) ? $result->headers : array();
  $etag = isset($response_headers['ETag']) ? $response_headers['ETag'] : '';
  $last_modified = isset($response_headers['Last-Modified']) ? $response_headers['Last-Modified'] : '';
  if ($has_etag == TRUE) {
    db_query("UPDATE {parser_common_syndication} SET etag = '%s', last_modified = '%s' WHERE url = '%s'", $etag, $last_modified, md5($url));
  }
  else {
    db_query("INSERT INTO {parser_common_syndication} (etag, last_modified, url) VALUES ('%s', '%s', '%s')", $etag, $last_modified, md5($url));
  }

  $data = empty($result->data) ? FALSE : $result->data;
  $download_cache[$url] = $data;
  return $data;
}

/**
 * Download a URL with cURL and split the response into headers and body.
 *
 * @param $url
 *   The URL to download.
 * @param $headers
 *   List of "Name: value" request header strings.
 * @param $username
 *   Username for HTTP authentication, or empty for none.
 * @param $password
 *   Password for HTTP authentication.
 * @param $accept_invalid_cert
 *   If TRUE, the peer's SSL certificate is not verified.
 * @return
 *   An object with the properties code (HTTP status), headers (name => value
 *   array) and data (response body), or FALSE if the transfer failed.
 */
function _parser_common_syndication_curl_get($url, $headers, $username, $password, $accept_invalid_cert) {
  $headers[] = 'User-Agent: Drupal (+http://drupal.org/)';

  $download = curl_init($url);
  curl_setopt($download, CURLOPT_FOLLOWLOCATION, TRUE);
  if (!empty($username)) {
    curl_setopt($download, CURLOPT_USERPWD, "{$username}:{$password}");
  }
  curl_setopt($download, CURLOPT_HTTPHEADER, $headers);
  curl_setopt($download, CURLOPT_HEADER, TRUE);
  curl_setopt($download, CURLOPT_RETURNTRANSFER, TRUE);
  if ($accept_invalid_cert) {
    curl_setopt($download, CURLOPT_SSL_VERIFYPEER, 0);
  }

  $data = curl_exec($download);
  if ($data === FALSE) {
    curl_close($download);
    return FALSE;
  }

  $result = new stdClass();
  $header_size = curl_getinfo($download, CURLINFO_HEADER_SIZE);
  $result->code = curl_getinfo($download, CURLINFO_HTTP_CODE);
  curl_close($download);

  $result->data = substr($data, $header_size);
  $result->headers = array();

  // Because redirects are followed, the header section can hold one header
  // block per response, separated by blank lines. Only the last block
  // belongs to the body that was actually returned.
  $header_blocks = preg_split("/\r\n\r\n|\n\n|\r\r/", trim(substr($data, 0, $header_size)));
  $header_lines = preg_split("/\r\n|\n|\r/", array_pop($header_blocks));
  // Skip the HTTP response status line.
  array_shift($header_lines);
  foreach ($header_lines as $line) {
    $line = trim($line);
    if ($line === '' || strpos($line, ':') === FALSE) {
      // Not a "Name: value" line.
      continue;
    }
    list($name, $value) = explode(':', $line, 2);
    if (isset($result->headers[$name]) && $name == 'Set-Cookie') {
      // RFC 2109: the Set-Cookie response header comprises the token Set-
      // Cookie:, followed by a comma-separated list of one or more cookies.
      $result->headers[$name] .= ','. trim($value);
    }
    else {
      $result->headers[$name] = trim($value);
    }
  }

  return $result;
}

/**
 * Delete the stored cache validators and the cache file when a feed node is
 * deleted.
 *
 * @param $node
 *   The node the operation is performed on.
 * @param $op
 *   The node operation; only 'delete' is handled.
 */
function parser_common_syndication_nodeapi(&$node, $op) {
  if ($op != 'delete') {
    return;
  }
  if (!isset($node->feed) && !feedapi_enabled_type($node->type)) {
    return;
  }
  if (empty($node->feed->url)) {
    // Nothing was ever stored without a feed URL.
    return;
  }

  // Rows are keyed by the md5 hash of the URL, exactly like the cache files.
  $url_hash = md5($node->feed->url);
  db_query("DELETE FROM {parser_common_syndication} WHERE url = '%s'", $url_hash);

  $cache_dir = _parser_common_syndication_sanitize_cache();
  if ($cache_dir !== FALSE) {
    $cache_filename = $cache_dir .'/'. $url_hash;
    if (file_exists($cache_filename)) {
      unlink($cache_filename);
    }
  }
}

/**
 * Determine the feed format of a SimpleXML parsed object structure.
 *
 * @param $xml
 *   SimpleXML-preprocessed feed.
 * @return
 *   The feed format short description or FALSE if not compatible.
*/
function _parser_common_syndication_feed_format_detect($xml) {
  if (!is_object($xml)) {
    return FALSE;
  }
  $attr = $xml->attributes();
  $type = strtolower($xml->getName());

  if ($type == "feed" && isset($xml->entry)) {
    return "atom1.0";
  }
  if ($type == "rdf" && isset($xml->channel)) {
    return "RDF";
  }
  if ($type == "rss") {
    // A missing version attribute yields an empty string and matches nothing.
    $version = (string) $attr["version"];
    $rss_formats = array("2.0" => "RSS2.0", "0.91" => "RSS0.91", "0.92" => "RSS0.92");
    foreach ($rss_formats as $rss_version => $format) {
      if ($version == $rss_version) {
        return $format;
      }
    }
  }
  return FALSE;
}

/**
 * Call one of the possible feedapi_get hook and pass back the downloaded data
 *
 * If the downloaded document is HTML, the feed linked from it is looked up
 * (autodiscovery) and downloaded instead.
 *
 * @param $url
 *   The URL to download. Credentials embedded in the URL are passed on as
 *   username and password.
 * @param $settings
 *   Optional settings array; only the 'accept_invalid_cert' key is read here.
 * @return
 *   string - the downloaded data, object - the cached parsed feed,
 *   FALSE - if the URL is not reachable
 */
function _parser_common_syndication_download($url, $settings = NULL) {
  // Defined up front so they also exist when $url does not pass valid_url().
  $password = $username = NULL;
  if (valid_url($url, TRUE)) {
    // Handle password protected feeds.
    $url_parts = parse_url($url);
    if (!empty($url_parts['user'])) {
      $password = isset($url_parts['pass']) ? $url_parts['pass'] : NULL;
      $username = $url_parts['user'];
    }
  }

  $accept_invalid_cert = isset($settings['accept_invalid_cert']) ? $settings['accept_invalid_cert'] : FALSE;
  $downloaded_string = _parser_common_syndication_feedapi_get($url, $username, $password, $accept_invalid_cert);

  // Cannot get the feed, pass the problem to one level up.
  if ($downloaded_string == FALSE) {
    return FALSE;
  }
  // The data comes from cache, just pass one level up.
  else if (is_object($downloaded_string)) {
    return $downloaded_string;
  }

  // Do the autodiscovery at this level, pass back the real data.
  // Maybe it's HTML. If it's not HTML, not worth to take a look into the downloaded string.
  // NOTE(review): the next statement is truncated in this copy of the file
  // (its string literals look like they had markup stripped out); it is kept
  // verbatim and must be restored from the upstream source. It presumably
  // opens an "if (...) {" block and defines $allowed_mime and $matches.
  if (strpos(strtolower($downloaded_string), "/si', $downloaded_string, $matches);
    $links = $matches[1];
    $rss_link = FALSE;
    foreach ($links as $link) {
      $mime = array();
      // Get the type attribute and check if the mime type is allowed.
      preg_match_all('/type\s*=\s*("|\')([A-Za-z\/+]*)("|\')/si', $link, $mime);
      if (in_array(array_pop($mime[2]), $allowed_mime)) {
        $href = array();
        // Get the href attribute.
        preg_match_all('/href\s*=\s*("|\')([=#\?_:.0-9A-Za-z\/+]*)("|\')/si', $link, $href);
        $rss_link = array_pop($href[2]);
        if (is_string($rss_link) && strlen($rss_link) > 0 && $rss_link != $url) {
          // Handle base url related stuff.
          $parsed_url = parse_url($rss_link);
          if (!isset($parsed_url['host'])) {
            // It's relative so make it absolute.
            $base_tag = array();
            // NOTE(review): the next statement is truncated in this copy as
            // well; kept verbatim. It presumably extracts $base_url from the
            // HTML base tag and opens "if (strlen($base_url) > 0) {".
            preg_match_all('/ 0) {
              // Get from the HTML base tag.
              $rss_link = $base_url . $rss_link;
            }
            else {
              // Guess from the original URL. Only components that are really
              // present are appended.
              $original_url = parse_url($url);
              $path = isset($parsed_url['path']) ? $parsed_url['path'] : '';
              if (strpos($path, '/') !== 0) {
                // Relative path: resolve it against the original document.
                $base_path = isset($original_url['path']) ? $original_url['path'] : '/';
                if ($path === '') {
                  $path = $base_path;
                }
                else {
                  $last_slash = strrpos($base_path, '/');
                  $path = ($last_slash === FALSE ? '/' : substr($base_path, 0, $last_slash + 1)) . $path;
                }
              }
              $rss_link = $original_url['scheme'] .'://'. $original_url['host'];
              if (isset($original_url['port'])) {
                $rss_link .= ':'. $original_url['port'];
              }
              $rss_link .= $path;
              if (isset($parsed_url['query'])) {
                $rss_link .= '?'. $parsed_url['query'];
              }
              if (isset($parsed_url['fragment'])) {
                $rss_link .= '#'. $parsed_url['fragment'];
              }
            }
          }
          // The recursive call already returns fully post-processed data (or
          // FALSE / a cached object), so hand its result straight back
          // instead of running the string filters below on it again.
          return _parser_common_syndication_download($rss_link, $settings);
        }
      }
    }
  }

  // Ugly hack to be able to retrieve the xml:base property, no way to access xml:lang inside
  $downloaded_string = preg_replace('/xml:base *=/', 'base=', $downloaded_string);

  // Filter out strange tags. Without this, the text would contain strange stuff.
  // @todo: make sure that these are not important for feed element mapper
  // NOTE(review): the patterns below look truncated in this copy (the tag
  // names seem to have been stripped); kept verbatim, restore from upstream.
  $downloaded_string_filtered = preg_replace(array('@]*?.*?@si', '@]*?.*?@si', '@]*?.*?@si', '@]*?.*?@si', '@]*?.*?@si', '@]*?.*?@si', '@]*?.*?@si'), '', $downloaded_string);

  return empty($downloaded_string_filtered) ? $downloaded_string : $downloaded_string_filtered;
}

/**
 * Parse atom feeds.
 *
 * @param $feed_XML
 *   The SimpleXML object of the feed.
 * @return
 *   An object with title, description, options->link and items. Every item
 *   has title, description and options (original_author, timestamp,
 *   original_url, guid, tags, domains).
 */
function _parser_common_syndication_atom10_parse($feed_XML) {
  $parsed_source = new stdClass();

  // The download step renames xml:base to base, so it shows up as a plain
  // attribute here. xpath() may not return an array, and array_shift() needs
  // a real variable, hence the intermediate step.
  $base_attributes = $feed_XML->xpath("@base");
  $base = (is_array($base_attributes) && count($base_attributes)) ? (string) reset($base_attributes) : '';
  if (!valid_url($base, TRUE)) {
    $base = FALSE;
  }

  // Detect the title
  $parsed_source->title = isset($feed_XML->title) ? _parser_common_syndication_title("{$feed_XML->title}") : "";
  // Detect the description
  $parsed_source->description = isset($feed_XML->subtitle) ? "{$feed_XML->subtitle}" : "";
  $parsed_source->options = new stdClass();
  $parsed_source->options->link = _parser_common_syndication_link($feed_XML->link);
  if (valid_url($parsed_source->options->link) && !valid_url($parsed_source->options->link, TRUE) && !empty($base)) {
    $parsed_source->options->link = $base . $parsed_source->options->link;
  }

  // The feed level author is the last resort for entries without one.
  $feed_author = !empty($feed_XML->author->name) ? "{$feed_XML->author->name}" : '';

  $parsed_source->items = array();
  foreach ($feed_XML->entry as $news) {
    $original_url = NULL;
    $guid = !empty($news->id) ? "{$news->id}" : NULL;
    // I don't know how standard this is, but sometimes the id is the URL.
    if (!empty($guid) && valid_url($guid, TRUE)) {
      $original_url = $guid;
    }

    $tags = array();
    $domains = array();
    if (isset($news->category)) {
      foreach ($news->category as $category) {
        // Store the term first so that the domain refers to the index of
        // this very category, not of the previous one.
        $tags[] = "{$category['term']}";
        if (isset($category['scheme'])) {
          $domain = "{$category['scheme']}";
          if (!empty($domain)) {
            if (!isset($domains[$domain])) {
              $domains[$domain] = array();
            }
            $domains[$domain][] = count($tags) - 1;
          }
        }
      }
    }

    $title = "{$news->title}";

    // The body is the content, or the summary if there is no content: the
    // markup of all child elements followed by the element's own text.
    $body = '';
    $body_element = NULL;
    if (!empty($news->content)) {
      $body_element = $news->content;
    }
    else if (!empty($news->summary)) {
      $body_element = $news->summary;
    }
    if ($body_element !== NULL) {
      foreach ($body_element->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$body_element}";
    }

    if (!empty($news->content['src'])) {
      // some src elements in some valid atom feeds contained no urls at all
      if (valid_url("{$news->content['src']}", TRUE)) {
        $original_url = "{$news->content['src']}";
      }
    }

    // Reset per entry so that an author never leaks over from the previous
    // entry.
    $original_author = '';
    if (!empty($news->source->author->name)) {
      $original_author = "{$news->source->author->name}";
    }
    else if (!empty($news->author->name)) {
      $original_author = "{$news->author->name}";
    }
    else if ($feed_author !== '') {
      $original_author = $feed_author;
    }

    // An explicit link wins; the id / content src found above are only
    // fallbacks for entries without a usable link.
    $entry_link = _parser_common_syndication_link($news->link);
    if (!empty($entry_link)) {
      $original_url = $entry_link;
    }

    // "published" (Atom 1.0), then "issued" (Atom 0.3), then "updated".
    if (isset($news->published)) {
      $date = "{$news->published}";
    }
    else if (isset($news->issued)) {
      $date = "{$news->issued}";
    }
    else if (isset($news->updated)) {
      $date = "{$news->updated}";
    }
    else {
      $date = '';
    }

    $item = new stdClass();
    $item->title = _parser_common_syndication_title($title, $body);
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->original_author = $original_author;
    $item->options->timestamp = _parser_common_syndication_parse_date($date);
    $item->options->original_url = trim((string) $original_url);
    if (valid_url($item->options->original_url) && !valid_url($item->options->original_url, TRUE) && !empty($base)) {
      $item->options->original_url = $base . $item->options->original_url;
    }
    $item->options->guid = $guid;
    $item->options->tags = $tags;
    $item->options->domains = $domains;
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}

/**
 * Parse RDF Site Summary (RSS) 1.0 feeds in RDF/XML format.
*
 * @see http://web.resource.org/rss/1.0/
 *
 * @param $feed_XML
 *   The SimpleXML object of the feed.
 * @return
 *   An object with title, description, options->link and items.
 */
function _parser_common_syndication_RDF10_parse($feed_XML) {
  // Declare some canonical standard prefixes for well-known namespaces:
  static $canonical_namespaces = array(
    'rdf'      => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs'     => 'http://www.w3.org/2000/01/rdf-schema#',
    'xsi'      => 'http://www.w3.org/2001/XMLSchema-instance#',
    'xsd'      => 'http://www.w3.org/2001/XMLSchema#',
    'owl'      => 'http://www.w3.org/2002/07/owl#',
    'dc'       => 'http://purl.org/dc/elements/1.1/',
    'dcterms'  => 'http://purl.org/dc/terms/',
    'dcmitype' => 'http://purl.org/dc/dcmitype/',
    'foaf'     => 'http://xmlns.com/foaf/0.1/',
    'rss'      => 'http://purl.org/rss/1.0/',
  );

  // Get all namespaces declared in the feed element, with special handling
  // for PHP versions prior to 5.1.2 as they don't handle namespaces.
  $namespaces = version_compare(phpversion(), '5.1.2', '<') ? array() : $feed_XML->getNamespaces(TRUE);

  // Start from an empty feed so that a document without a channel element
  // still yields a well-formed result.
  $parsed_source = (object)array(
    'title' => '',
    'description' => '',
    'options' => (object)array('link' => ''),
    'items' => array(),
  );

  // Process the resource containing feed metadata (first channel only):
  foreach ($feed_XML->children($canonical_namespaces['rss'])->channel as $rss_channel) {
    $parsed_source->title = _parser_common_syndication_title((string)$rss_channel->title);
    $parsed_source->description = (string)$rss_channel->description;
    $parsed_source->options->link = (string)$rss_channel->link;
    break;
  }

  // Process each resource contained in the feed:
  foreach ($feed_XML->children($canonical_namespaces['rss'])->item as $rss_item) {
    // Extract all available RDF statements from the feed item's RDF/XML
    // tags, allowing for both the item's attributes and child elements to
    // contain RDF properties:
    $rdf_data = array();
    foreach ($namespaces as $ns => $ns_uri) {
      // Note that we attempt to normalize the found property name
      // namespaces to well-known 'standard' prefixes where possible, as the
      // feed may in principle use any arbitrary prefixes and we should
      // still be able to correctly handle it.
      $ns_prefix = array_search($ns_uri, $canonical_namespaces);
      if (!$ns_prefix) {
        $ns_prefix = $ns;
      }
      foreach ($rss_item->attributes($ns_uri) as $attr_name => $attr_value) {
        $rdf_data[$ns_prefix .':'. $attr_name][] = (string)$attr_value;
      }
      foreach ($rss_item->children($ns_uri) as $rss_property) {
        $rdf_data[$ns_prefix .':'. $rss_property->getName()][] = (string)$rss_property;
      }
    }

    // Declaratively define mappings that determine how to construct the
    // object that gets passed back to FeedAPI:
    $item = _parser_common_syndication_RDF10_item($rdf_data, (object)array(
      'title' => array('rss:title', 'dc:title'),
      'description' => array('rss:description', 'dc:description', 'content:encoded'),
      'options' => (object)array(
        'guid' => 'rdf:about',
        'timestamp' => 'dc:date',
        'original_author' => array('dc:creator', 'dc:publisher'),
        'original_url' => array('rss:link', 'rdf:about'),
        'tags' => 'dc:subject',
      ),
    ));

    // Special handling for the title:
    $item->title = _parser_common_syndication_title($item->title, $item->description);

    // Parse any date/time values into Unix timestamps:
    $item->options->timestamp = _parser_common_syndication_parse_date($item->options->timestamp);

    // If no author name found, use the feed title:
    if (empty($item->options->original_author)) {
      $item->options->original_author = $parsed_source->title;
    }

    // Add every found RDF property to the FeedAPI item in order for Feed
    // Element Mapper to be able to map these properties:
    $item->rdf = new stdClass();
    foreach ($rdf_data as $rdf_property => $rdf_value) {
      $rdf_property = str_replace(':', '_', $rdf_property); // looks nicer in the mapper UI
      $item->rdf->$rdf_property = $rdf_value;
    }

    $parsed_source->items[] = $item;
  }

  return $parsed_source;
}

/**
 * Look up the values of the first mapped RDF property that has any.
 *
 * @param $rdf_data
 *   Array of value lists, keyed by 'prefix:name' property names.
 * @param $rdf_properties
 *   Array of property names in order of preference. The names can be passed
 *   as separate arguments as well.
 * @return
 *   The list of non-empty values of the first property that has at least
 *   one, or NULL if none of the properties has a non-empty value.
 */
function _parser_common_syndication_RDF10_property($rdf_data, $rdf_properties = array()) {
  $rdf_properties = is_array($rdf_properties) ? $rdf_properties : array_slice(func_get_args(), 1);
  foreach ($rdf_properties as $rdf_property) {
    if ($rdf_property && !empty($rdf_data[$rdf_property])) {
      // Remove empty strings. A property that holds only empty strings must
      // not hide a later, filled-in alternative.
      $values = array_values(array_filter($rdf_data[$rdf_property], 'strlen'));
      if (count($values) > 0) {
        return $values;
      }
    }
  }
  return NULL;
}

/**
 * Fill a mapping object with the RDF values its entries refer to.
 *
 * @param $rdf_data
 *   Array of value lists, keyed by 'prefix:name' property names.
 * @param $mappings
 *   Object whose properties hold a property name, an array of property names
 *   in order of preference, or a nested mapping object.
 * @return
 *   The mapping object with every entry replaced by the value found: a
 *   single value, an array if there are several, or NULL if there is none.
 */
function _parser_common_syndication_RDF10_item($rdf_data, $mappings) {
  foreach (get_object_vars($mappings) as $k => $v) {
    if (is_object($v)) {
      $mappings->$k = _parser_common_syndication_RDF10_item($rdf_data, $v);
    }
    else {
      $values = _parser_common_syndication_RDF10_property($rdf_data, $v);
      // A single value is unwrapped, several values stay an array.
      $mappings->$k = (is_array($values) && count($values) == 1) ? reset($values) : $values;
    }
  }
  return (object)$mappings;
}

/**
 * Parse RSS2.0 feeds.
 *
 * @param $feed_XML
 *   The SimpleXML object of the feed.
 * @return
 *   An object with title, description, options->link and items. Every item
 *   has title, description and options (original_author, timestamp,
 *   original_url, guid, domains, tags).
 */
function _parser_common_syndication_RSS20_parse($feed_XML) {
  $parsed_source = new stdClass();
  // Detect the title.
  $parsed_source->title = isset($feed_XML->channel->title) ? _parser_common_syndication_title("{$feed_XML->channel->title}") : "";
  // Detect the description.
  $parsed_source->description = isset($feed_XML->channel->description) ? "{$feed_XML->channel->description}" : "";
  $parsed_source->options = new stdClass();
  // Detect the link.
  $parsed_source->options->link = isset($feed_XML->channel->link) ? "{$feed_XML->channel->link}" : "";
  $parsed_source->items = array();

  // The feed title stands in for the author of every item.
  $original_author = !empty($feed_XML->channel->title) ? "{$feed_XML->channel->title}" : '';
  $has_namespace_support = version_compare(phpversion(), '5.1.2', '>=');

  $feed_items = $feed_XML->xpath('//item');
  if (!is_array($feed_items)) {
    $feed_items = array();
  }

  foreach ($feed_items as $news) {
    $categories = $news->xpath('category');
    if (!is_array($categories)) {
      $categories = array();
    }
    // Get children for current namespace. Reset for every item so that
    // nothing is carried over from the previous one.
    $content = $has_namespace_support ? (array)$news->children('http://purl.org/rss/1.0/modules/content/') : array();
    $news = (array) $news;

    $guid = isset($news['guid']) ? "{$news['guid']}" : NULL;
    $title = isset($news['title']) ? "{$news['title']}" : '';

    // The body starts out undefined for every item; it must not be inherited
    // from the previous item.
    $body = isset($news['description']) ? "{$news['description']}" : NULL;
    // Some sources use content:encoded as description i.e. PostNuke PageSetter module.
    // $news holds it for PHP < 5.1.2, $content for PHP >= 5.1.2. The longer
    // text wins.
    foreach (array($news, $content) as $elements) {
      if (isset($elements['encoded'])) {
        $encoded = "{$elements['encoded']}";
        if (strlen((string) $body) < strlen($encoded)) {
          $body = $encoded;
        }
      }
    }
    if (!isset($body)) {
      $body = $title;
    }

    $original_url = !empty($news['link']) ? "{$news['link']}" : NULL;

    $tags = array();
    $domains = array();
    foreach ($categories as $category) {
      $tags[] = "{$category}";
      if (isset($category['domain'])) {
        $domain = "{$category['domain']}";
        if (!empty($domain)) {
          if (!isset($domains[$domain])) {
            $domains[$domain] = array();
          }
          // Index of the category that was just stored.
          $domains[$domain][] = count($tags) - 1;
        }
      }
    }

    $item = new stdClass();
    $item->title = _parser_common_syndication_title($title, $body);
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->original_author = $original_author;
    if (isset($news['pubDate'])) {
      $item->options->timestamp = _parser_common_syndication_parse_date($news['pubDate']);
    }
    else {
      $item->options->timestamp = time();
    }
    $item->options->original_url = trim((string) $original_url);
    $item->options->guid = $guid;
    $item->options->domains = $domains;
    $item->options->tags = $tags;
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}

/**
 * Set the default caching directory if the current setting is not useable
 *
 * @return
 *   The path of a writable cache directory, or FALSE if there is none and it
 *   cannot be created.
 */
function _parser_common_syndication_sanitize_cache() {
  $cache_location = file_directory_path() .'/parser_common_syndication_cache';
  if (is_dir($cache_location) && is_writable($cache_location)) {
    return $cache_location;
  }

  $cache_location = file_create_path($cache_location);
  if (!$cache_location) {
    // NOTE(review): presumably file_create_path() can refuse the path;
    // without a path there is nothing to create.
    return FALSE;
  }
  if (!file_exists($cache_location) && is_writable(file_directory_path())) {
    mkdir($cache_location);
  }
  if (!is_dir($cache_location) || !is_writable($cache_location)) {
    return FALSE;
  }
  return $cache_location;
}

/**
 * Parse a date comes from a feed.
 *
 * @param $date_str
 *   The date string in various formats. If an array of date strings is
 *   passed, its first element is used.
 * @return
 *   The timestamp of the string or the current time if can't be parsed
 */
function _parser_common_syndication_parse_date($date_str) {
  if (is_array($date_str)) {
    $date_str = reset($date_str);
  }
  $date_str = trim((string) $date_str);
  if ($date_str === '') {
    return time();
  }
  $parsed_date = strtotime($date_str);
  // Old PHP versions signal a failure with -1 instead of FALSE.
  if ($parsed_date === FALSE || $parsed_date == -1) {
    $parsed_date = _parser_common_syndication_parse_w3cdtf($date_str);
  }
  return $parsed_date === FALSE ? time() : $parsed_date;
}

/**
 * Parse the W3C date/time format, a subset of ISO 8601.
 *
 * PHP date parsing functions do not handle this format.
 * See http://www.w3.org/TR/NOTE-datetime for more information.
 * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
 *
 * @param $date_str
 *   A string with a potentially W3C DTF date.
 * @return
 *   A timestamp if parsed successfully or FALSE if not.
 */
function _parser_common_syndication_parse_w3cdtf($date_str) {
  // Groups: 1 year, 2 month, 3 day, 4 hours, 5 minutes, 6 ":seconds",
  // 7 seconds, 8 offset sign, 9 offset hours, 10 offset minutes, 11 "Z".
  // Optional groups that did not take part in the match are either empty
  // strings or missing from $match altogether.
  if (!preg_match('/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(:(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/', (string) $date_str, $match)) {
    return FALSE;
  }

  // Group 7, not 6: group 6 includes the leading colon.
  $seconds = (isset($match[7]) && $match[7] !== '') ? (int) $match[7] : 0;

  // Calculate the epoch for current date assuming GMT.
  $epoch = gmmktime((int) $match[4], (int) $match[5], $seconds, (int) $match[2], (int) $match[3], (int) $match[1]);

  // Z is zulu time, aka GMT, and needs no correction, nor does a date
  // without any zone. Only an explicit numeric offset is applied.
  if (!empty($match[8])) {
    $tz_hour = isset($match[9]) ? (int) $match[9] : 0;
    $tz_min = isset($match[10]) ? (int) $match[10] : 0;
    $offset_secs = (($tz_hour * 60) + $tz_min) * 60;
    // Is timezone ahead of GMT? If yes, subtract offset.
    if ($match[8] == '+') {
      $offset_secs *= -1;
    }
    $epoch += $offset_secs;
  }
  return $epoch;
}

/**
 * Extract the link that points to the original content (back to site or original article)
 *
 * The first link with an href whose rel is "alternate" is used. A link
 * without a rel attribute counts as "alternate" (see RFC 4287). If there is
 * no such link, the href of the last link is used.
 *
 * @param $links
 *   Array of SimpleXML objects
 * @return
 *   The href of the chosen link, or an empty string.
 */
function _parser_common_syndication_link($links) {
  $to_link = '';
  if (!is_object($links) && !is_array($links)) {
    return $to_link;
  }
  foreach ($links as $link) {
    $attributes = $link->attributes();
    $to_link = isset($attributes["href"]) ? "{$attributes["href"]}" : "";
    $rel = isset($attributes["rel"]) ? "{$attributes["rel"]}" : 'alternate';
    if ($rel == 'alternate' && $to_link !== '') {
      break;
    }
  }
  return $to_link;
}

/**
 * Prepare raw data to be a title
 *
 * @param $title
 *   The title candidate.
 * @param $body
 *   Optional text; if the title is empty, its first three words (or fewer,
 *   if it is shorter) become the title.
 * @return
 *   The title.
 */
function _parser_common_syndication_title($title, $body = FALSE) {
  if (empty($title) && !empty($body)) {
    // Explode to words and use the first 3 words. The fourth piece, if any,
    // is the unsplit remainder and is dropped.
    $words = preg_split("/[\s,]+/", $body, 4, PREG_SPLIT_NO_EMPTY);
    $title = implode(' ', array_slice($words, 0, 3));
  }
  return $title;
}