$sensitivity) { db_query("INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created) VALUE(%d, %d, %d, %f, %d)", $app_id, $map[$v1], $map[$v2], $score, $created); } } }
  // remove similarity rows left over from previous runs of this computation
  db_query("DELETE FROM {recommender_similarity} WHERE app_id=%d AND created<>%d", $app_id, $created);
}

/**
 * Fast correlation matrix calculation.
 *
 * @param $matrix
 *   Has to be indexed by [0..m][0..n]. It WILL BE MODIFIED (each row is
 *   mean-centered in place) to save memory space!!!
 * @return array
 *   The symmetric correlation matrix indexed [0..m][0..m], or NULL when the
 *   input is not a well-formed rectangular matrix.
 */
function &_recommender_fast_correlation_coefficient(&$matrix) {
  if (!isset($matrix) || !is_array($matrix)) {
    return NULL;
  }
  $m = count($matrix);
  $n = count($matrix[0]); // assume the matrix has the same width in every row

  // Pass 1: center each row on its mean and collect sqrt of the (unscaled)
  // variance; dividing by n cancels out in the correlation formula anyway.
  $variance = array();
  foreach ($matrix as &$vector) {
    if (count($vector) != $n) {
      // ragged matrix: bail out
      return NULL;
    }
    // adjust each element in the vector by the mean, and calculate variance.
    $mean = array_sum($vector) / $n;
    $sigma = 0;
    for ($i = 0; $i < $n; $i++) {
      $vector[$i] = $vector[$i] - $mean;
      $sigma += pow($vector[$i], 2);
    }
    // due to float point error, a certain small number could be indeed 0.
    // but it doesn't seem to hurt. so just comment them out.
    //$vector_diagnal[] = sqrt($sigma)<0.00001 ? 0 : sqrt($sigma);
    $variance[] = sqrt($sigma); // note we didn't divide it by n.
  }
  // break the dangling reference left behind by foreach-by-reference, so a
  // later write to $vector cannot silently corrupt the last matrix row.
  unset($vector);

  // Pass 2: correlation of every pair (upper triangle, mirrored).
  $cor_matrix = array();
  $product = array();
  for ($v1 = 0; $v1 < $m; $v1++) {
    for ($v2 = $v1; $v2 < $m; $v2++) {
      $vector_1 = &$matrix[$v1];
      $vector_2 = &$matrix[$v2];
      if ($variance[$v1] == 0 || $variance[$v2] == 0) {
        // Zero variance: correlation is undefined, treat it as 0.
        // FIX: also set the mirror cell -- the original only set
        // [$v1][$v2], leaving [$v2][$v1] unset and the matrix asymmetric.
        $cor_matrix[$v1][$v2] = 0;
        $cor_matrix[$v2][$v1] = 0;
      }
      else {
        for ($i = 0; $i < $n; $i++) {
          $product[$i] = $vector_1[$i] * $vector_2[$i];
        }
        $cor = array_sum($product) / ($variance[$v1] * $variance[$v2]);
        $cor_matrix[$v1][$v2] = $cor;
        $cor_matrix[$v2][$v1] = $cor;
      }
    }
  }
  return $cor_matrix;
}

/**
 * Classical algorithm computed in databases. Full functioning.
 * It has no memory limits because all data are saved in database.
 * But the performance is worse than in-memory computation.
*/
function _recommender_similarity_classical_in_database($app_id, $table_name, $field_mouse, $field_cheese, $field_weight, $options) {
  // get param values
  $missing = isset($options['missing']) ? $options['missing'] : 'none';
  // timestamp shared by every row written in this run; rows with a different
  // timestamp are purged afterwards.
  $created = time();
  // operations would be quite different for different missing data treament
  switch ($missing) {
    // in this case, we treat the missing data as missing.
    case 'none': default:
      // compute pair-wise avg/stddev for each mouse, which is different in each pair.
      db_query("DELETE FROM {recommender_helper_pair_stat}");
      db_query("INSERT INTO {recommender_helper_pair_stat}(id1, id2, avg1, avg2, stddev1, stddev2) SELECT n1.$field_mouse, n2.$field_mouse, AVG(n1.$field_weight), AVG(n2.$field_weight), STDDEV(n1.$field_weight), STDDEV(n2.$field_weight) FROM {{$table_name}} n1 INNER JOIN {{$table_name}} n2 ON n1.$field_cheese=n2.$field_cheese GROUP BY n1.$field_mouse, n2.$field_mouse");
      // compute similarity. would be time-consuming
      db_query("INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created) SELECT %d, n1.$field_mouse, n2.$field_mouse, (AVG((n1.$field_weight-a.avg1)*(n2.$field_weight-a.avg2)))/(a.stddev1*a.stddev2) corr, %d FROM {{$table_name}} n1 INNER JOIN {{$table_name}} n2 ON n1.$field_cheese=n2.$field_cheese AND n1.$field_mouse<=n2.$field_mouse INNER JOIN {recommender_helper_pair_stat} a ON n1.$field_mouse=a.id1 AND n2.$field_mouse=a.id2 GROUP BY n1.$field_mouse, n2.$field_mouse HAVING corr<>0", $app_id, $created);
      // insert duplicate ones (to save time, we didn't do half of the calculation in operation above)
      // NOTE(review): the SQL below looks truncated/corrupted ("WHERE
      // mouse1_id0 AND s2.stddev<>0 ..."). A span of the original text --
      // probably "mouse1_id<mouse2_id ..." plus an entire case branch (the
      // 'zero'/expanded-matrix treatment) -- appears to have been eaten,
      // likely by an angle-bracket stripper. Recover from version control;
      // TODO confirm before relying on this function.
      db_query("INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created) SELECT app_id, mouse2_id, mouse1_id, similarity, created FROM {recommender_similarity} WHERE mouse1_id0 AND s2.stddev<>0 GROUP BY n1.mouse_id, n2.mouse_id HAVING corr<>0", $app_id, $created);
      // insert duplicate ones (to save time, we didn't do half of the calculation in operation above)
      // NOTE(review): this statement also appears truncated ("WHERE
      // mouse1_id%d") for the same reason as above -- TODO confirm.
      db_query("INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created) SELECT app_id, mouse2_id, mouse1_id, similarity, created FROM {recommender_similarity} WHERE mouse1_id%d", $app_id, $created);
  }
  // NOTE(review): the closing brace of this function appears to be missing
  // (lost in the same corruption); as written, the following functions are
  // nested inside it. Verify against version control.

/**
 * Expand the sparse database table to have the missing data default as zero.
 * 'adjusted' means that only mouses that share common cheese will be expanded.
 *
 * @param $table_name the input table name
 * @param $field_mouse the input table field for mouse
 * @param $field_cheese the input table field for cheese
 * @param $field_weight the input table field weight
 * @param $missing the way of handling missing data. could be either 'zero' or 'adjusted'.
 * @return null the {recommender_helper_matrix} and {recommender_helper_pair_stat} will be filled with data.
 */
function _recommender_expand_sparse_data($table_name, $field_mouse, $field_cheese, $field_weight, $missing) {
  // clean up
  db_query("DELETE FROM {recommender_helper_matrix}");
  // if 'adjusted', we record the pairs of mice that share at least one cheese.
  // we'll skip those mice that don't share cheese with other mice.
  // this measure is extremely useful for very sparse dataset.
  if ($missing=='adjusted') {
    db_query("DELETE FROM {recommender_helper_pair_stat}");
    db_query("INSERT INTO {recommender_helper_pair_stat}(id1, id2) SELECT n1.$field_mouse, n2.$field_mouse FROM {{$table_name}} n1 INNER JOIN {{$table_name}} n2 ON n1.$field_cheese=n2.$field_cheese GROUP BY n1.$field_mouse, n2.$field_mouse");
  }
  // expand sparse matrix in {$table_name} to the full matrix, saved to {recommender_helper_matrix}, initially as 0
  // if 'adjusted', then we skip those mice that don't share at least one cheese with other mice.
  $sql_adjusted = ($missing!='adjusted') ?
    '' : "INNER JOIN (SELECT DISTINCT id1 FROM {recommender_helper_pair_stat}) p ON l.$field_mouse=p.id1";
  // cross join of all (mouse, cheese) pairs, weight initially 0
  db_query("INSERT INTO {recommender_helper_matrix} SELECT m.mouse_id, c.cheese_id, 0 FROM (SELECT DISTINCT l.$field_mouse mouse_id FROM {{$table_name}} l $sql_adjusted) m JOIN (SELECT DISTINCT $field_cheese cheese_id FROM {{$table_name}}) c");
  // update the full matrix to have the correct value where cell is not 0
  db_query("UPDATE {recommender_helper_matrix} m, {{$table_name}} l SET m.weight=l.$field_weight WHERE m.mouse_id=l.$field_mouse AND m.cheese_id=l.$field_cheese");
}

/**
 * Classical weight-average algorithm to calculate prediction from the similarity matrix, based on average weight.
 * Limitation: we only do database operation for now. no in-memory operation available until future release.
 * Limitation: we only do average weight. regression-based weight maybe included in future release.
 *
 * @param $app_name the application name that uses this function.
 * @param $table_name the input table name
 * @param $field_mouse the input table field for mouse
 * @param $field_cheese the input table field for cheese
 * @param $field_weight the input table field weight
 * @param $options an array of options
 *   'missing': how to handle missing data -- 'none' do nothing; 'zero' fill in missing data with zero; 'adjusted' skip mice that don't share cheese in common.
 *   'duplicate': how to handle predictions that already exists in mouse-cheese evaluation. 'preserve' or 'eliminate'
 *   'sensitivity': if similarity is smaller enough to be less than a certain value, we treat it as zero
 * @return null {recommender_prediction} will be filled with prediction data
 */
function recommender_prediction_classical($app_name, $table_name = 'recommender_link', $field_mouse = 'mouse_id', $field_cheese = 'cheese_id', $field_weight = 'weight', $options = array()) {
  // get param values
  $app_id = recommender_get_app_id($app_name);
  $missing = isset($options['missing']) ? $options['missing'] : 'none';
  $duplicate = isset($options['duplicate']) ? $options['duplicate'] : 'preserve'; // could be 'eliminate'
  $sensitivity = isset($options['sensitivity']) ? $options['sensitivity'] : 0;
  $created = time();

  // append missing data with 0s. then use table {recommender_helper_matrix} instead.
  // note: the local table/field variables are repointed at the expanded copy.
  if ($missing == 'zero' || $missing == 'adjusted') {
    _recommender_expand_sparse_data($table_name, $field_mouse, $field_cheese, $field_weight, $missing);
    $table_name = '{recommender_helper_matrix}';
    $field_mouse = 'mouse_id';
    $field_cheese = 'cheese_id';
    $field_weight = 'weight';
  }

  // calculate the mean value for each mouse, will be used as the starting point of each prediction.
  db_query("DELETE FROM {recommender_helper_single_stat}");
  db_query("INSERT INTO {recommender_helper_single_stat}(id, avg) SELECT $field_mouse, AVG($field_weight) FROM {{$table_name}} GROUP BY $field_mouse");

  // for some reason, the following SQL, esp."INNER JOIN {recommender_helper_single_stat} m1 ON s.mouse1_id=m1.id"
  // is really time-consuming. probably it's also in the GROUP BY?
  // changed to the 2ne SQL approach is much more efficient.
  /* db_query("INSERT INTO {recommender_prediction}(app_id, mouse_id, cheese_id, prediction, created) SELECT %d, s.mouse1_id, t.$field_cheese, m1.avg+SUM(s.similarity*(t.$field_weight-m2.avg))/SUM(ABS(s.similarity)) prediction, %d FROM {recommender_similarity} s INNER JOIN {recommender_helper_single_stat} m1 ON s.mouse1_id=m1.id INNER JOIN {recommender_helper_single_stat} m2 ON s.mouse2_id=m2.id INNER JOIN {{$table_name}} t ON s.mouse2_id=t.$field_mouse WHERE s.app_id=%d AND ABS(s.similarity)>%f GROUP BY s.mouse1_id, t.$field_cheese", $app_id, $created, $app_id, $sensitivity); */

  // if we treat missing data as adjusted, then we only count opinions from mice who share cheese from other mice.
  $sql_adjusted = ($missing!='adjusted') ? '' : "INNER JOIN {recommender_helper_pair_stat} p ON s.mouse1_id=p.id1 AND s.mouse2_id=p.id2";

  // calculate prediction. could be time-consuming
  db_query("INSERT INTO {recommender_prediction}(app_id, mouse_id, cheese_id, prediction, created) SELECT %d, s.mouse1_id, t.$field_cheese, SUM(s.similarity*(t.$field_weight-m.avg))/SUM(ABS(s.similarity)) prediction, %d FROM {recommender_similarity} s INNER JOIN {recommender_helper_single_stat} m ON s.mouse2_id=m.id INNER JOIN {{$table_name}} t ON s.mouse2_id=t.$field_mouse $sql_adjusted WHERE s.app_id=%d AND ABS(s.similarity)>%f GROUP BY s.mouse1_id, t.$field_cheese", $app_id, $created, $app_id, $sensitivity);

  // adjust the prediction based on each mouse's average evaluation.
  db_query("UPDATE {recommender_prediction} p, {recommender_helper_single_stat} m SET prediction=prediction+m.avg WHERE p.mouse_id=m.id AND p.app_id=%d AND created=%d", $app_id, $created);

  // remove duplicate prediction
  if ($duplicate == 'eliminate') {
    if ($missing == 'zero' || $missing == 'adjusted') {
      $missing_sql = "WHERE $field_weight<>0";
    }
    // NOTE(review): when $missing is 'none', $missing_sql is never assigned
    // and interpolates as an empty string (with an E_NOTICE). Looks intended,
    // but verify.
    db_query("DELETE FROM {recommender_prediction} WHERE app_id=%d AND created=%d AND (mouse_id, cheese_id) IN (SELECT $field_mouse, $field_cheese FROM {{$table_name}} $missing_sql)", $app_id, $created);
  }

  // clean_up
  db_query("DELETE FROM {recommender_helper_single_stat}");
  if ($missing == 'zero' || $missing == 'adjusted') {
    db_query("DELETE FROM {recommender_helper_matrix}");
    if ($missing == 'adjusted') {
      db_query("DELETE FROM {recommender_helper_pair_stat}");
    }
  }
  // remove old predictions from the last calculation
  db_query("DELETE FROM {recommender_prediction} WHERE app_id=%d AND created<>%d", $app_id, $created);
}

/**
 * Co-ocurrences algorithm that compute similarity among mice based on how many cheese they share.
 *
 * @param $app_name the application name that uses this function.
 * @param $table_name the input table name
 * @param $field_mouse the input table field for mouse
 * @param $field_cheese the input table field for cheese
 * @param $options an array of options
 *   'incremental': whether to rebuild the whole similarity matrix, or incrementally update those got changed. 'rebuild' or 'update'
 * @return null {recommender_similarity} will be filled with similarity data
 */
function recommender_similarity_coocurrences($app_name, $table_name, $field_mouse, $field_cheese, $options = array()) {
  // get param values
  $app_id = recommender_get_app_id($app_name);
  $created = time();
  $incremental = isset($options['incremental']) ? $options['incremental'] : 'rebuild';

  // the basic skeleton sql
  // note: $app_id and $created are interpolated directly (they come from
  // recommender_get_app_id()/time(), not from user input) so that the same
  // string can be reused with an appended HAVING clause below.
  $basic_sql = "INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created) SELECT $app_id, n1.$field_mouse, n2.$field_mouse, COUNT(n1.$field_cheese), $created FROM {{$table_name}} n1 INNER JOIN {{$table_name}} n2 ON n1.$field_cheese=n2.$field_cheese GROUP BY n1.$field_mouse, n2.$field_mouse";

  switch ($incremental) {
    // rebuild the similarity index
    case 'rebuild': default:
      db_query($basic_sql);
      // a full rebuild supersedes any queued incremental updates
      db_query("DELETE FROM {recommender_updated_queue} WHERE app_id=%d AND created<=%d", $app_id, $created);
      db_query("DELETE FROM {recommender_similarity} WHERE app_id=%d AND created<>%d", $app_id, $created);
      break;

    // incrementally update. might incur more overhead than 'rebuild'
    case 'update':
      $max_timestamp = _recommender_updated_max($app_id);
      // nothing queued since the last run: nothing to do
      if ($max_timestamp == NULL) return;
      // parenthesized id list, e.g. "(1,5,9)", built by the helper
      $ids = _recommender_updated_list($app_id, $max_timestamp, 'mouse_id');
      db_query("DELETE FROM {recommender_similarity} WHERE app_id=%d AND (mouse1_id IN $ids OR mouse2_id IN $ids)", $app_id);
      // recompute only rows touching an updated mouse
      db_query($basic_sql ." HAVING (n1.$field_mouse IN $ids OR n2.$field_mouse IN $ids)");
      _recommender_updated_markdone($app_id, $max_timestamp);
      break;
  }
}

/**
 * This is the implementation of slope-one algorithm, which is supposed to be faster than other algrithms.
 * From the original research paper, the author argues slope-one support incremental update. But this is quite hard to implement.
 * The incremental update doesn't work for now.
 * Missing data is just treated as missing. For slope-one, we don't automatically expand the matrix to have zeros.
 * The responsibility of handling missing data is on the caller functions.
 *
 * @param $app_name the application name that uses this function.
 * @param $table_name the input table name
 * @param $field_mouse the input table field for mouse
 * @param $field_cheese the input table field for cheese
 * @param $field_weight the input table field weight
 * @param $options an array of options
 *   'extention': whether to use 'basic', 'weighted', or 'bipolar' extensions of the algorithm.
 *   Please refer to the original research paper. Usually it could just be 'weighted'.
 *   'duplicate': how to handle predictions that already exists in mouse-cheese evaluation. 'preserve' or 'eliminate'
 *   'incremental': whether to 'rebuild' or to 'update'. CAUTION: 'update' doesn't work now.
 * @return null {recommender_prediction} will be filled with prediction data
 */
function recommender_prediction_slopeone($app_name, $table_name, $field_mouse, $field_cheese, $field_weight, $options = array()) {
  // get param values
  $app_id = recommender_get_app_id($app_name);
  $extension = isset($options['extension']) ? $options['extension'] : 'weighted'; // could be 'weighted', 'bipolar'
  $incremental = isset($options['incremental']) ? $options['incremental'] : 'rebuild'; // could be 'update'
  $duplicate = isset($options['duplicate']) ? $options['duplicate'] : 'eliminate'; // or 'preserve'
  $created = time();

  db_query("DELETE FROM {recommender_slopeone_dev} WHERE app_id=%d", $app_id);
  // create dev(i,j): average weight deviation between every pair of cheeses
  // rated by the same mouse, plus the support count.
  db_query("INSERT INTO {recommender_slopeone_dev}(app_id, cheese1_id, cheese2_id, count, dev) SELECT %d, n1.$field_cheese, n2.$field_cheese, COUNT(*), AVG(n1.$field_weight-n2.$field_weight) FROM {{$table_name}} n1 INNER JOIN {{$table_name}} n2 ON n1.$field_mouse=n2.$field_mouse AND n1.$field_cheese <> n2.$field_cheese GROUP BY n1.$field_cheese, n2.$field_cheese", $app_id);

  // create P(u,j): choose the aggregation expression for the chosen extension.
  if ($extension == 'basic') {
    $extension_sql = "AVG(t.$field_weight+p.dev)";
  } else if ($extension == 'weighted') { // the 'weighted slope one'
    $extension_sql = "SUM((t.$field_weight+p.dev)*p.count)/SUM(p.count)";
  }
  // haven't implemented the "bipolar" extension of Slope One.
  // NOTE(review): for 'bipolar' or any other unrecognized extension value,
  // $extension_sql stays undefined and the INSERT below interpolates an empty
  // string, producing malformed SQL -- confirm and guard in a future fix.

  // generate predictions.
  db_query("INSERT INTO {recommender_prediction}(app_id, mouse_id, cheese_id, prediction, created) SELECT %d, t.$field_mouse, p.cheese1_id, $extension_sql, %d FROM {recommender_slopeone_dev} p INNER JOIN {{$table_name}} t ON p.cheese2_id=t.$field_cheese GROUP BY t.$field_mouse, p.cheese1_id", $app_id, $created);

  // remove duplicate prediction
  if ($duplicate == 'eliminate') {
    db_query("DELETE FROM {recommender_prediction} WHERE app_id=%d AND created=%d AND (mouse_id, cheese_id) IN (SELECT $field_mouse, $field_cheese FROM {{$table_name}})", $app_id, $created);
  }
  // purge predictions left from earlier runs
  db_query("DELETE FROM {recommender_prediction} WHERE app_id=%d AND created<>%d", $app_id, $created);
}

/////////////////////// Helper functions /////////////////////

/**
 * Get the application id from the application name.
 * @param $app_name
 * @return the id of the application.
*/
function recommender_get_app_id($app_name) {
  if (!isset($app_name) || empty($app_name)) {
    return NULL;
  }
  // look up the id; '@' suppresses the error if {recommender_app_map} does
  // not exist yet. NOTE(review): SELECT-then-INSERT is racy under concurrent
  // callers (two could both insert) -- confirm whether app_name is unique-keyed.
  $id = db_result(@db_query("SELECT app_id FROM {recommender_app_map} WHERE app_name='%s'", $app_name));
  if (!isset($id) || empty($id)) {
    // first time this application is seen: register it and re-read the id.
    db_query("INSERT INTO {recommender_app_map}(app_name) VALUE('%s')", $app_name);
    $id = db_result(db_query("SELECT app_id FROM {recommender_app_map} WHERE app_name='%s'", $app_name));
  }
  return $id;
}

/*
 * Below is a set of incremental update helper functions
 * This is very experiment. Please use with caution.
 */

// Add a new record to the updated_queue for further process.
function recommender_updated_add($app_name, $mouse_id, $cheese_id, $old_value, $new_value) {
  $app_id = recommender_get_app_id($app_name);
  $created = time();
  // map PHP NULL to the literal string 'NULL' so it is written unquoted by
  // the unquoted %s placeholders below and becomes SQL NULL.
  // NOTE(review): non-null values also pass through %s unquoted -- this only
  // works for numeric weights; confirm that's the intended domain.
  if ($old_value == NULL) $old_value = 'NULL';
  if ($new_value == NULL) $new_value = 'NULL';
  db_query("INSERT INTO {recommender_updated_queue}(app_id, mouse_id, cheese_id, old, new, created) VALUE(%d, %d, %d, %s, %s, %d)", $app_id, $mouse_id, $cheese_id, $old_value, $new_value, $created);
}

// purge the updated queue: 'outdated' removes only processed rows
// (status=-100, see _recommender_updated_markdone()); anything else wipes all
// rows for the application.
function recommender_updated_purge($app_name, $op = 'outdated') {
  $app_id = recommender_get_app_id($app_name);
  $op_sql = ($op == 'outdated') ? "AND status=-100" : '';
  db_query("DELETE FROM {recommender_updated_queue} WHERE app_id=%d $op_sql", $app_id);
}

// get the max timestamp from the updated queue (unprocessed rows only);
// returns NULL when nothing is pending.
function _recommender_updated_max($app_id) {
  $updated_info = db_fetch_array(db_query("SELECT COUNT(*) count, MAX(created) max FROM {recommender_updated_queue} WHERE app_id=%d AND status!=-100", $app_id));
  // if nothing is updated, then do nothing.
  if ($updated_info['count'] == 0) return NULL;
  return $updated_info['max'];
}

// get the list of updated items in parenthesis.
/**
 * Build a parenthesized, comma-separated list of the distinct $field values
 * queued up to $max_timestamp, e.g. "(1,5,9)", suitable for splicing into an
 * SQL "IN" clause by the callers.
 *
 * @param $app_id the application id, as returned by recommender_get_app_id().
 * @param $max_timestamp only queue entries created at or before this time are included.
 * @param $field the queue column to collect, e.g. 'mouse_id' or 'cheese_id'.
 * @return string the list in parenthesis; "()" when nothing matched.
 */
function _recommender_updated_list($app_id, $max_timestamp, $field = 'mouse_id') {
  $result = db_query("SELECT DISTINCT $field FROM {recommender_updated_queue} WHERE app_id=%d AND created<=%d", $app_id, $max_timestamp);
  // FIX: initialize the accumulator. The original appended to an undeclared
  // variable, which raises an E_NOTICE and hands NULL to implode() whenever
  // the result set is empty.
  $updated = array();
  while ($r = db_fetch_array($result)) {
    $updated[] = $r[$field];
  }
  return '('. implode(',', $updated) .')';
}

/**
 * Mark queue items up to $max_timestamp as processed (status=-100), so that
 * _recommender_updated_max() and recommender_updated_purge() skip them.
 */
function _recommender_updated_markdone($app_id, $max_timestamp) {
  db_query("UPDATE {recommender_updated_queue} SET status=-100 WHERE app_id=%d AND created<=%d", $app_id, $max_timestamp);
}