<?php
/* $Id: parser_common_syndication.module,v 1.6 2007-07-23 17:27:08 aronnovak Exp $ */

/**
 * @file
 * Download a feed, parse it with SimpleXML and provide a data structure describing the feed.
 * Requires PHP 5 because of SimpleXML.
 */

/**
 * Implementation of hook_help().
 *
 * Only the 'admin/modules#description' section is answered; every other
 * section falls through and yields no value.
 */
function parser_common_syndication_help($section) {
  if ($section == 'admin/modules#description') {
    return t('Provide a common syndication parser for FeedAPI-compatible modules');
  }
}

/**
 * Implementation of hook_feedapi_compatible().
 *
 * Downloads the feed, parses it with SimpleXML and reports the feed type
 * when the format detector recognizes the document.
 *
 * @param $url
 *  The feed's url
 * @return
 *  a string - feed type if the parser is able to process it, FALSE if it's not compatible
 *  (SimpleXML missing, download failed, malformed XML or unknown format)
 */
function parser_common_syndication_feedapi_compatible($url) {
  if (!function_exists('simplexml_load_string')) {
    return FALSE;
  }
  $downloaded_string = _parser_common_syndication_download($url);
  // A failed download cannot be compatible; do not hand FALSE to the XML parser
  if (!is_string($downloaded_string) || $downloaded_string === '') {
    return FALSE;
  }
  // The libxml option flags are only accepted from PHP 5.1.0 on
  if (!defined('LIBXML_VERSION') || (version_compare(phpversion(), '5.1.0', '<'))) {
    @ $xml = simplexml_load_string($downloaded_string, NULL);
  }
  else {
    @ $xml = simplexml_load_string($downloaded_string, NULL, LIBXML_NOERROR | LIBXML_NOWARNING);
  }
  if (_parser_common_syndication_feed_format_detect($xml) != FALSE) {
    // We don't have to choose between the types, because this module is only able to parse one.
    // array_shift() takes its argument by reference, so it needs a real variable.
    $types = parser_common_syndication_feedapi_type();
    return array_shift($types);
  }
  return FALSE;
}

/**
 * Implementation of hook_feedapi_parse().
 *
 * @param $feed
 *  The feed object; only its ->url property is read here
 * @return stdClass
 *  The structured data extracted from the feed, or FALSE when the feed
 *  cannot be downloaded, is not well-formed XML or has an unknown format
 */
function parser_common_syndication_feedapi_parse($feed) {
  $raw_feed = _parser_common_syndication_download($feed->url);
  if ($raw_feed == FALSE) {
    return FALSE;
  }

  // The libxml option flags can only be passed when libxml and PHP >= 5.1.0 are present
  $libxml_flags_usable = defined('LIBXML_VERSION') && !version_compare(phpversion(), '5.1.0', '<');
  if ($libxml_flags_usable) {
    @ $xml = simplexml_load_string($raw_feed, NULL, LIBXML_NOERROR | LIBXML_NOWARNING);
  }
  else {
    @ $xml = simplexml_load_string($raw_feed, NULL);
  }

  // Malformed XML: nothing to parse
  if ($xml === FALSE || $xml == NULL) {
    return FALSE;
  }

  // Hand the document over to the parser matching the detected format
  switch (_parser_common_syndication_feed_format_detect($xml)) {
    case "atom1.0":
      return _parser_common_syndication_atom10_parse($xml);

    case "RSS2.0":
    case "RSS0.91":
    case "RSS0.92":
      return _parser_common_syndication_RSS20_parse($xml);

    case "RDF":
      return _parser_common_syndication_RDF10_parse($xml);
  }
  return FALSE;
}

/**
 * Implementation of hook_feedapi_type().
 * Lists the feed types this module can handle
 *
 * @return
 *  An array of type names; this parser knows exactly one
 */
function parser_common_syndication_feedapi_type() {
  $supported_types = array();
  $supported_types[] = "XML feed";
  return $supported_types;
}

/**
 * Work out which syndication format a SimpleXML parsed document uses
 *
 * @param $xml
 *  SimpleXML-preprocessed feed; anything that is not an object is rejected
 * @return
 *  a string - "atom1.0", "RSS2.0", "RSS0.91", "RSS0.92" or "RDF";
 *  FALSE if the document matches none of them
 */
function _parser_common_syndication_feed_format_detect($xml) {
  if (!is_object($xml)) {
    return FALSE;
  }
  $root_attributes = $xml->attributes();
  $root_name = strtolower($xml->getName());

  // Atom: a <feed> root that carries at least one <entry>
  if ($root_name == "feed") {
    return isset($xml->entry) ? "atom1.0" : FALSE;
  }

  // RDF / RSS 1.0: an <rdf> root with a <channel>
  if ($root_name == "rdf") {
    return isset($xml->channel) ? "RDF" : FALSE;
  }

  // RSS: the version attribute on the <rss> root decides
  if ($root_name == "rss") {
    $known_versions = array(
      "2.0" => "RSS2.0",
      "0.91" => "RSS0.91",
      "0.92" => "RSS0.92",
    );
    foreach ($known_versions as $version => $format) {
      if ($root_attributes["version"] == $version) {
        return $format;
      }
    }
  }
  return FALSE;
}

/**
 * Fetch the feed through the first feedapi_get implementation that shares
 * a feed type with this parser, and pass back what it downloaded
 *
 * @param $url
 *  The feed's url
 * @return
 *  string - the downloaded data, FALSE - if nothing could be downloaded
 */
function _parser_common_syndication_download($url) {
  $own_types = parser_common_syndication_feedapi_type();
  $payload = "";

  foreach (module_implements("feedapi_get") as $module) {
    $module_types = module_invoke($module, "feedapi_type");
    // Skip downloaders that share no feed type with this parser
    if (count(array_intersect($own_types, $module_types)) == 0) {
      continue;
    }
    // The first matching downloader wins, whatever it returns
    $payload = module_invoke($module, "feedapi_get", $url);
    break;
  }

  // An empty result means the feed could not be fetched; let the caller deal with it
  return ($payload == "") ? FALSE : $payload;
}

/**
 * Parse atom feeds
 *
 * @param $feed_XML
 *  SimpleXML object of the feed's root element
 * @return stdClass
 *  ->title, ->description and ->options->link describe the feed itself;
 *  ->items is an array with one object per entry, each carrying ->title,
 *  ->description and ->options (teaser, original_author, timestamp,
 *  original_url, guid, tags). tags is always an array; original_author,
 *  original_url and guid are NULL when the entry does not provide them.
 */
function _parser_common_syndication_atom10_parse($feed_XML) {
  $parsed_source = new stdClass();
  // Detect the title
  $parsed_source->title = isset($feed_XML->title) ? (string) $feed_XML->title : "";
  // Detect the description
  $parsed_source->description = isset($feed_XML->subtitle) ? (string) $feed_XML->subtitle : "";
  $parsed_source->options = new stdClass();
  // Detect the link
  $parsed_source->options->link = "";
  if (count($feed_XML->link) > 0) {
    $link = $feed_XML->link;
    $link = $link->attributes();
    $parsed_source->options->link = isset($link["href"]) ? (string) $link["href"] : "";
  }

  $parsed_source->items = array();

  // The feed-level author is the last resort for entries without their own author
  $feed_author = isset($feed_XML->author->name) ? "{$feed_XML->author->name}" : NULL;

  foreach ($feed_XML->entry as $news) {
    // Everything below is computed per entry; nothing may leak in from the previous one
    $original_url = NULL;

    if ($news->id) {
      $guid = "{$news->id}";
    }
    else {
      $guid = NULL;
    }

    // I don't know how standard this is, but sometimes the id is the URL
    if ($guid !== NULL && valid_url($guid, TRUE)) {
      $original_url = $guid;
    }

    $tags = array();
    if ($news->category) {
      foreach ($news->category as $category) {
        $tags[] = "{$category['term']}";
      }
    }

    $title = "{$news->title}";

    // Prefer the full content, fall back to the summary, otherwise leave the body empty
    $body = '';
    if ($news->content) {
      $body = _parser_common_syndication_atom10_inner_markup($news->content);
    }
    else if ($news->summary) {
      $body = _parser_common_syndication_atom10_inner_markup($news->summary);
    }

    if ($news->content['src']) {
      // some src elements in some valid atom feeds contained no urls at all
      if (valid_url("{$news->content['src']}", TRUE)) {
        $original_url = "{$news->content['src']}";
      }
    }

    if ($news->summary) {
      $teaser = _parser_common_syndication_atom10_inner_markup($news->summary);
    }
    else {
      $teaser = node_teaser($body);
    }

    // Author precedence: source author, entry author, feed author.
    // isset() keeps the chained lookups quiet when an intermediate element is missing.
    if (isset($news->source->author->name)) {
      $original_author = "{$news->source->author->name}";
    }
    else if (isset($news->author->name)) {
      $original_author = "{$news->author->name}";
    }
    else {
      $original_author = $feed_author;
    }

    if ($news->link['href'] && valid_url("{$news->link['href']}", TRUE)) {
      $original_url = "{$news->link['href']}";
    }

    // published is optional in Atom while updated is mandatory, so try both
    // before giving up and using the current time
    $timestamp = strtotime("{$news->published}");
    if ($timestamp === FALSE) {
      $timestamp = strtotime("{$news->updated}");
    }
    if ($timestamp === FALSE) {
      $timestamp = time();
    }

    $item = new stdClass();
    $item->title = $title;
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->teaser = $teaser;
    $item->options->original_author = $original_author;
    $item->options->timestamp = $timestamp;
    $item->options->original_url = $original_url;
    $item->options->guid = $guid;
    $item->options->tags = $tags;
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}

/**
 * Flatten an atom text construct (content or summary) into a string:
 * the serialized child elements first, followed by the element's own text
 *
 * @param $element
 *  SimpleXML element to flatten
 * @return
 *  string - markup of the children plus the element's text
 */
function _parser_common_syndication_atom10_inner_markup($element) {
  $markup = '';
  foreach ($element->children() as $child) {
    $markup .= $child->asXML();
  }
  return $markup . "{$element}";
}

/**
 * Parse RSS1.0/RDF feeds
 *
 * @param $feed_XML
 *  SimpleXML object of the feed's root element
 * @return stdClass
 *  ->title, ->description and ->options->link describe the feed itself;
 *  ->items is an array with one object per item, each carrying ->title,
 *  ->description and ->options (teaser, original_author, timestamp,
 *  original_url, guid, tags). tags is always an array and guid is a
 *  string or NULL.
 */
function _parser_common_syndication_RDF10_parse($feed_XML) {
  $parsed_source = new stdClass();
  // Detect the title
  $parsed_source->title = isset($feed_XML->channel->title) ? (string) $feed_XML->channel->title : "";
  // Detect the description
  $parsed_source->description = isset($feed_XML->channel->description) ? (string) $feed_XML->channel->description : "";
  $parsed_source->options = new stdClass();
  // Detect the link
  $parsed_source->options->link = isset($feed_XML->channel->link) ? (string) $feed_XML->channel->link : "";
  $parsed_source->items = array();

  // set category splitter (space is for del.icio.us feed)
  $category_splitter = ' ';

  // get the default original author; stays NULL when the channel has no title
  $oa = NULL;
  if ($feed_XML->channel->title) {
    $oa = (string) $feed_XML->channel->title;
  }

  // get all namespaces
  $namespaces_supported = !version_compare(phpversion(), '5.1.2', '<');
  if ($namespaces_supported) {
    $namespaces = $feed_XML->getNamespaces(TRUE);
  }
  else {
    //versions prior 5.1.2 don't allow namespaces
    $namespaces = array('default' => NULL);
  }

  foreach ($feed_XML->item as $news) {
    //initialization
    $guid = $original_url = NULL;
    $title = $body = '';
    $timestamp = time();
    $tags = array();
    $original_author = $oa;

    foreach ($namespaces as $ns_link) {
      //get about attribute as guid (stored as a plain string, not a SimpleXML object)
      foreach ($news->attributes($ns_link) as $name => $value) {
        if ($name == 'about') {
          $guid = (string) $value;
        }
      }

      //get children for current namespace
      if ($namespaces_supported) {
        $ns = (array) $news->children($ns_link);
      }
      else {
        $ns = (array) $news;
      }

      //title
      $value = _parser_common_syndication_RDF10_first_string($ns, 'title');
      if ($value !== '') {
        $title = $value;
      }

      //description or dc:description, only while nothing better has been found;
      //it must never replace a content:encoded body
      $value = _parser_common_syndication_RDF10_first_string($ns, 'description');
      if ($value !== '' && $body === '') {
        $body = $value;
      }

      //link
      $value = _parser_common_syndication_RDF10_first_string($ns, 'link');
      if ($value !== '') {
        $original_url = $value;
      }

      //dc:creator
      $value = _parser_common_syndication_RDF10_first_string($ns, 'creator');
      if ($value !== '') {
        $original_author = $value;
      }

      //dc:date, ignored when it cannot be parsed so the timestamp stays numeric
      $value = _parser_common_syndication_RDF10_first_string($ns, 'date');
      if ($value !== '') {
        $parsed_date = strtotime($value);
        if ($parsed_date !== FALSE) {
          $timestamp = $parsed_date;
        }
      }

      //content:encoded always wins over a plain description
      $value = _parser_common_syndication_RDF10_first_string($ns, 'encoded');
      if ($value !== '') {
        $body = $value;
      }

      //dc:subject
      if (isset($ns['subject'])) {
        //there can be multiple category tags
        if (is_array($ns['subject'])) {
          foreach ($ns['subject'] as $cat) {
            if (is_object($cat)) {
              $tags[] = trim(strip_tags($cat->asXML()));
            }
            else {
              $tags[] = $cat;
            }
          }
        }
        else { //or single tag holding a splitter separated list
          $subject = (string) $ns['subject'];
          if ($subject !== '') {
            $tags = array_merge($tags, explode($category_splitter, $subject));
          }
        }
      }
    }

    // description is not mandatory so use title if description not present
    if (!$body) {
      $body = $title;
    }

    //make teaser
    $teaser = node_teaser($body);

    // if there are no link tag but rdf:about is provided
    if (!$original_url && $guid) {
      $original_url = $guid;
    }
    $item = new stdClass();
    $item->title = $title;
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->teaser = $teaser;
    $item->options->original_author = $original_author;
    $item->options->timestamp = $timestamp;
    $item->options->original_url = $original_url;
    $item->options->guid = $guid;
    // categories belong in tags, like in the atom and RSS2.0 parsers
    $item->options->tags = $tags;
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}

/**
 * Read one child value out of an array-cast SimpleXML element as a string
 *
 * @param $children
 *  Array produced by casting a SimpleXML element to array
 * @param $name
 *  Name of the child element
 * @return
 *  string - the value of the first child with that name, '' when there is none
 */
function _parser_common_syndication_RDF10_first_string($children, $name) {
  if (!isset($children[$name])) {
    return '';
  }
  $value = $children[$name];
  // repeated elements arrive as an array; use the first occurrence
  if (is_array($value)) {
    $value = reset($value);
  }
  return (string) $value;
}

/**
 * Parse RSS2.0 feeds
 *
 * @param $feed_XML
 *  SimpleXML object of the feed's root element
 * @return stdClass
 *  ->title, ->description and ->options->link describe the feed itself;
 *  ->items is an array with one object per item, each carrying ->title,
 *  ->description and ->options (teaser, original_author, timestamp,
 *  original_url, guid, tags). All values are plain strings (or NULL when
 *  missing) and tags is always an array.
 */
function _parser_common_syndication_RSS20_parse($feed_XML) {
  $parsed_source = new stdClass();
  // Detect the title
  $parsed_source->title = isset($feed_XML->channel->title) ? (string) $feed_XML->channel->title : "";
  // Detect the description
  $parsed_source->description = isset($feed_XML->channel->description) ? (string) $feed_XML->channel->description : "";
  $parsed_source->options = new stdClass();
  // Detect the link
  $parsed_source->options->link = isset($feed_XML->channel->link) ? (string) $feed_XML->channel->link : "";
  $parsed_source->items = array();

  // The channel title stands in as the author of every item; NULL when there is no title
  $original_author = NULL;
  if (isset($feed_XML->channel->title)) {
    $original_author = (string) $feed_XML->channel->title;
  }

  // children() expects the namespace URI, not the prefix used in the document
  $content_namespace = 'http://purl.org/rss/1.0/modules/content/';

  // xpath() returns FALSE on failure; treat that as a feed without items
  $feed_items = $feed_XML->xpath('//item');
  if (!is_array($feed_items)) {
    $feed_items = array();
  }

  foreach ($feed_items as $news) {
    // content:encoded lives in the content module namespace
    $content = $news->children($content_namespace);

    // Cast everything to string so no SimpleXML object ends up in the result
    $guid = (string) $news->guid;
    if ($guid === '') {
      $guid = NULL;
    }

    $title = (string) $news->title;

    $body = (string) $news->description;
    // some sources use content:encoded as description i.e. PostNuke PageSetter module
    if ($body === '') {
      // parsers without namespace support expose content:encoded as a plain child
      $body = (string) $news->encoded;
    }
    if ($body === '') {
      $body = (string) $content->encoded;
    }
    if ($body === '') {
      $body = $title;
    }

    $teaser = node_teaser($body);

    $original_url = (string) $news->link;
    if ($original_url === '') {
      $original_url = NULL;
    }

    $timestamp = strtotime((string) $news->pubDate);
    if ($timestamp === FALSE) {
      $timestamp = time();
    }

    // An item may carry several categories, each possibly a '/' separated path
    $tags = array();
    foreach ($news->category as $category) {
      $category = (string) $category;
      if ($category !== '') {
        $tags = array_merge($tags, explode('/', $category));
      }
    }

    $item = new stdClass();
    $item->title = $title;
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->teaser = $teaser;
    $item->options->original_author = $original_author;
    $item->options->timestamp = $timestamp;
    $item->options->original_url = $original_url;
    $item->options->guid = $guid;
    $item->options->tags = $tags;
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}