'admin/settings/apachesolr/attachments',
      'title' => t('Apache Solr Attachments Settings'),
      'description' => t('Administer Apache Solr Attachments'),
      'callback' => 'drupal_get_form',
      'callback arguments' => 'apachesolr_attachments_settings',
      'access' => user_access('administer site configuration'),
    );
  }
  return $items;
}

/**
 * Displays the Attachment Settings Form.
 *
 * Builds one text field per supported attachment type (PDF, plain text and
 * Word documents). Each field is keyed by the variable it reads its default
 * from, and holds the helper command line used to extract text from that
 * type of file. An empty value disables indexing for that type.
 *
 * @return
 *   The form array, passed through system_settings_form().
 */
function apachesolr_attachments_settings() {
  $form = array();

  // The text is handed to t() as a literal rather than through a variable so
  // that translation template extractors can find the string.
  $form['instructions'] = array(
    '#type' => 'markup',
    '#value' => t('For each type of attachment, enter the path to the helper application installed on your server. "%file%" is a placeholder for the path of the attachment file and is required. If you don\'t want to search a type of attachment, leave the path setting blank (i.e., remove the content from the appropriate field below).'),
  );

  $form['apachesolr_attachment_pdf_path'] = array(
    '#type' => 'textfield',
    '#title' => t('PDF Helper'),
    '#size' => 50,
    '#maxlength' => 100,
    '#description' => t("The full path to the helper for application/pdf files, plus any other parameters needed by the helper."),
    '#default_value' => variable_get('apachesolr_attachment_pdf_path', ''),
  );

  $form['apachesolr_attachment_txt_path'] = array(
    '#type' => 'textfield',
    '#title' => t('Text Helper'),
    '#size' => 50,
    '#maxlength' => 100,
    '#description' => t("The full path to the helper for text/plain files, plus any other parameters needed by the helper."),
    '#default_value' => variable_get('apachesolr_attachment_txt_path', ''),
  );

  $form['apachesolr_attachment_doc_path'] = array(
    '#type' => 'textfield',
    '#title' => t('Word Doc Helper'),
    '#size' => 50,
    '#maxlength' => 100,
    '#description' => t("The full path to the helper for application/msword files, plus any other parameters needed by the helper."),
    '#default_value' => variable_get('apachesolr_attachment_doc_path', ''),
  );

  return system_settings_form($form);
}

/**
 * Implementation of hook_search().
*/
function apachesolr_attachments_search($op = 'search', $keys = NULL) {
  switch ($op) {
    case 'name':
      // We don't want a tab.
      return '';

    case 'reset':
      ApacheSolrUpdate::reset(SOLR_ATTACHMENT_NS);
      return;

    case 'status':
      // Figure out a way to know how many are left to update, or expose it
      // as part of the apachesolr module.
      return;

    case 'search':
      return apachesolr_search_search($op, $keys);
  }
}

/**
 * Hook is called by search.module to add things to the search index.
 *
 * In our case we will search content types and add any CCK type that
 * is a file type that we know how to parse and any uploaded file
 * attachments.
 *
 * For every node reported by ApacheSolrUpdate::getNodesToIndex(), the
 * attachments already in the index are removed, one Solr document is built
 * per attachment that yields non-empty text, and the node is recorded as
 * processed. All collected documents are handed to _as_index_documents() once
 * the loop has finished.
 */
function apachesolr_attachments_update_index() {
  // Start from an empty list so a run that finds nothing to index still hands
  // a real array (not an undefined variable) to _as_index_documents().
  $documents = array();

  // The site URL and its hash are identical for every document, so they are
  // computed once rather than once per file.
  $site = url(NULL, NULL, NULL, TRUE);
  $hash = md5($site);

  $result = ApacheSolrUpdate::getNodesToIndex(SOLR_ATTACHMENT_NS);
  while ($row = db_fetch_object($result)) {
    // Variables to track the last item changed.
    $solr_last_change = $row->last_change;
    $solr_last_id = $row->nid;

    $node = node_load($row->nid);
    // Skip rows whose node could not be loaded; empty() also copes with
    // node_load() returning a non-object.
    if (empty($node->nid)) {
      continue;
    }

    // Since there is no notification for an attachment being unassociated
    // with a node (but that action will trigger it to be indexed again), let's
    // remove all indexed attachments, then add all attached (if any).
    _asa_remove_attachments_from_index($node->nid);

    $files = _asa_get_indexable_files($node);
    if (!empty($files)) {
      // Update solr index.
      try {
        foreach ($files as $file) {
          // Some are arrays, others are objects; treat them all as objects.
          $file = (object) $file;

          $text = trim(_asa_get_attachment_text($file));
          if (empty($text)) {
            continue;
          }

          $document = new Apache_Solr_Document();
          $document->site = $site;
          $document->hash = $hash;
          $document->url = file_create_url($file->filepath);
          // NOTE(review): the document id is the bare fid; confirm it cannot
          // collide with the ids used for node documents in the same index.
          $document->id = $file->fid;
          $document->nid = $node->nid;
          $document->title = $file->filename;
          $document->changed = $node->changed;
          $document->uid = $node->uid;
          $document->body = $text;
          $document->text = "{$file->description} {$file->filename} $text";
          $document->type = $node->type;
          $document->bsfield_isfile = TRUE;
          _as_configure_taxonomy($document, $node);

          // Let modules add to the document.
          foreach (module_implements('apachesolr_attachments_update_index') as $module) {
            $function = $module .'_apachesolr_attachments_update_index';
            $function($document, $node, $file);
          }

          $documents[] = $document;
        }
      }
      catch (Exception $e) {
        // An exception stops processing of this node's remaining files; the
        // documents collected so far are kept.
        watchdog(SOLR_ATTACHMENT_WD, $e->getMessage(), WATCHDOG_ERROR);
      }
    }

    ApacheSolrUpdate::success(SOLR_ATTACHMENT_NS, $solr_last_change, $solr_last_id);
  }

  _as_index_documents($documents);
}

/**
 * Implementation of hook_nodeapi().
 *
 * For a delete: Remove all associated attachments from the Solr store.
 * No other operation is handled.
 */
function apachesolr_attachments_nodeapi($node, $op) {
  switch ($op) {
    case 'delete':
      _asa_remove_attachments_from_index($node->nid);
      break;
  }
}

/**
 * Implementation of hook_apachesolr_process_results().
 *
 * When using the core Apache Solr module, everything is treated as a node and as such
 * the link and type won't be configured correctly if it is a file attachment, so override
 * those values here if needed.
*/
function apachesolr_attachments_apachesolr_process_results(&$results) {
  // $results is taken by reference: the snippet is rewritten in place and
  // nothing is returned, so a by-value parameter would discard the change.
  // NOTE(review): assumes the hook is invoked directly with a variable, e.g.
  // $function($results) - confirm against the calling module.
  if (!is_array($results)) {
    return;
  }

  foreach ($results as &$item) {
    if (isset($item['node']->bsfield_isfile) && $item['node']->bsfield_isfile === TRUE) {
      // Prefix the snippet with a link to the node the file is attached to.
      $nid = $item['node']->nid;
      $node_title = db_result(db_query("SELECT title FROM {node} WHERE nid = %d", $nid));
      $item['snippet'] = l($node_title, "node/$nid") . ': ' . $item['snippet'];
    }
  }
  // Break the reference left behind by the loop so a later write to $item
  // cannot silently modify the last result.
  unset($item);
}

/**
 * Return all file attachments for a particular node.
 *
 * @param $node
 *   The node object to inspect.
 *
 * @return
 *   An array made of the entries of $node->files followed by the values of
 *   every CCK file field present on the node. Empty if there are none.
 */
function _asa_get_indexable_files($node) {
  $files = array();

  if (!empty($node->files)) {
    $files = array_merge($files, $node->files);
  }

  foreach (_asa_get_cck_file_fields() as $field) {
    if (!empty($node->$field)) {
      $files = array_merge($files, $node->$field);
    }
  }

  return $files;
}

/**
 * Return all CCK fields that are of type 'file'.
 *
 * @return
 *   An array of field names; empty when the filefield module is not enabled.
 */
function _asa_get_cck_file_fields() {
  $file_fields = array();

  if (module_exists('filefield')) {
    foreach (content_fields() as $key => $values) {
      if ($values['type'] == 'file') {
        $file_fields[] = $key;
      }
    }
  }

  return $file_fields;
}

/**
 * Parse the attachment, getting just the raw text and stripping any garbage
 * characters that could screw up the XML doc processing.
 *
 * @param $file
 *   File object; its filemime selects the helper and its filepath is passed
 *   to the helper in place of the %file% token.
 *
 * @return
 *   The cleaned text produced by the helper, or '' when no helper is
 *   configured for the MIME type or the helper produced no output.
 */
function _asa_get_attachment_text($file) {
  $helper_command = _asa_get_file_helper_command($file->filemime);
  // Empty entries in settings mean that helper is disabled.
  if ($helper_command == '') {
    return '';
  }

  // %file% is a token that is placed in the helper's parameter list to
  // represent the file path to the attachment. It is first swapped for a
  // purely alphanumeric marker (dropping any quotes the administrator put
  // around it) so that escapeshellcmd() can be applied to the configured
  // command without touching the token.
  $marker = 'asafile' . md5($helper_command);
  $template = strtr($helper_command, array(
    '"%file%"' => $marker,
    "'%file%'" => $marker,
    '%file%' => $marker,
  ));
  $template = escapeshellcmd($template);

  // The path is inserted as one quoted shell argument. escapeshellcmd() alone
  // does not quote spaces, so a path containing them would be split into
  // several arguments; and a plain string replacement (unlike preg_replace())
  // does not treat "$1" or "\1" inside the path as backreferences.
  $command = str_replace($marker, escapeshellarg($file->filepath), $template);

  $text = shell_exec($command);
  // shell_exec() gives no string back when the helper fails or prints nothing.
  if (!is_string($text) || $text === '') {
    return '';
  }

  // Strip anything that might make the Solr integration barf.
  // Weird control characters make things behave weirdly, especially in XML.
  $cleaned_text = iconv("utf-8", "utf-8//IGNORE", $text);

  // As per robertDouglass - http://drupal.org/node/335871
  // Bad control character. Do we need to make a hook for text cleanup?
  $cleaned_text = preg_replace('/\x0C/', '', $cleaned_text);

  return $cleaned_text;
}

/**
 * For a particular node id, remove all file attachments from the solr index.
 *
 * Failures are logged to watchdog and otherwise ignored.
 *
 * @param $nid
 *   The node id whose attachment documents should be deleted.
 */
function _asa_remove_attachments_from_index($nid) {
  try {
    $solr = _get_solr_instance();
    $solr->deleteByQuery("nid:{$nid} AND bsfield_isfile:true");
    $solr->commit();
  }
  catch (Exception $e) {
    watchdog(SOLR_ATTACHMENT_WD, $e->getMessage(), WATCHDOG_ERROR);
  }
}

/**
 * For a provided fid, get the file path.
 *
 * @param $fid
 *   Numeric file id.
 *
 * @return
 *   The filepath column of the matching {files} row, or NULL when $fid is
 *   empty, not numeric, or matches no row.
 */
function _asa_get_file_url($fid) {
  if (empty($fid) || !is_numeric($fid)) {
    return NULL;
  }

  $result = db_query('SELECT * FROM {files} WHERE fid = %d', $fid);
  $file = db_fetch_array($result);
  // Without a matching row there is no array to read the path from.
  if (!is_array($file) || !isset($file['filepath'])) {
    return NULL;
  }

  return $file['filepath'];
}

/**
 * Get the command to parse text out of a particular mime type.
 *
 * @param $type
 *   MIME type of the file.
 *
 * @return
 *   The configured helper command line, or '' when the type is not supported
 *   or no helper has been configured for it.
 */
function _asa_get_file_helper_command($type) {
  // Determine helper based on the MIME type.
  switch ($type) {
    case 'application/pdf':
      $cmd = variable_get('apachesolr_attachment_pdf_path', '');
      break;

    case 'text/plain':
      $cmd = variable_get('apachesolr_attachment_txt_path', '');
      break;

    case 'application/msword':
      $cmd = variable_get('apachesolr_attachment_doc_path', '');
      break;

    default:
      $cmd = '';
  }

  return $cmd;
}

/**
 * Get a reference to the Solr service.
 *
 * Wraps _get_solr_instance() so that callers get FALSE instead of an
 * exception; the exception message is logged to watchdog.
 *
 * @return
 *   The Solr service object, or FALSE on failure.
 */
function _asa_get_solr_instance() {
  try {
    return _get_solr_instance();
  }
  catch (Exception $e) {
    watchdog(SOLR_ATTACHMENT_WD, $e->getMessage(), WATCHDOG_ERROR);
  }
  return FALSE;
}

/***************************************************************************************/
/**  The following functions should become part of the Apache Solr module API         **/
/***************************************************************************************/

/**
 * Get a reference to the Solr service. This consolidates calls to variable_get(), etc.
*/
function _get_solr_instance() {
  // Connection settings, read from site variables with their defaults.
  $host = variable_get('apachesolr_host', 'localhost');
  $port = variable_get('apachesolr_port', 8983);
  $path = variable_get('apachesolr_path', '/solr');

  $solr =& apachesolr_get_solr($host, $port, $path);
  // NOTE(review): apachesolr_get_solr() is not visible here; the is_object()
  // check guards against it returning a non-object, so that case is reported
  // as an exception rather than a fatal error on ->ping().
  if (!is_object($solr) || !$solr->ping()) {
    throw new Exception(t('No Solr instance available'));
  }

  return $solr;
}

/**
 * Add taxonomy from the node to the solr document for the attachment.
 *
 * @param $document
 *   The Solr document to add values to, via setMultiValue().
 * @param $node
 *   The node whose taxonomy terms are copied. Nothing is added when the node
 *   has no taxonomy array.
 */
function _as_configure_taxonomy($document, $node) {
  // isset() first, so a node without a taxonomy property is skipped quietly.
  if (!isset($node->taxonomy) || !is_array($node->taxonomy)) {
    return;
  }

  foreach ($node->taxonomy as $term) {
    $document->setMultiValue('tid', $term->tid);

    // Double indexing of tids lets us do efficient searches (on tid)
    // and do accurate per-vocabulary faceting.
    $document->setMultiValue('imfield_vid' . $term->vid, $term->tid);

    $document->setMultiValue('vid', $term->vid);
    $document->setMultiValue('taxonomy_name', $term->name);
  }
}

/**
 * Take the full list of docs to submit to Solr and add them in batches.
 *
 * Documents are added in chunks of 50, followed by one commit and one
 * optimize call. Exceptions are logged to watchdog and otherwise ignored.
 *
 * @param $documents
 *   Array of documents to add. Anything empty or not an array is ignored.
 */
function _as_index_documents($documents) {
  // Nothing to send: return before contacting Solr. This also covers callers
  // that pass NULL, which count() must not be given.
  if (empty($documents) || !is_array($documents)) {
    return;
  }

  $solr = _asa_get_solr_instance();
  if (!is_object($solr)) {
    // _asa_get_solr_instance() has already logged the failure.
    return;
  }

  watchdog(SOLR_ATTACHMENT_WD, t("Adding @count documents to Solr", array('@count' => count($documents))));
  try {
    // Chunk the adds by 50s.
    foreach (array_chunk($documents, 50) as $docs) {
      $solr->addDocuments($docs);
    }
    $solr->commit();
    $solr->optimize(FALSE, FALSE);
  }
  catch (Exception $e) {
    watchdog(SOLR_ATTACHMENT_WD, $e->getMessage(), WATCHDOG_ERROR);
  }
}