$sensitivity) { db_query("INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created) VALUE(%d, %d, %d, %f, %d)", $app_id, $map[$v1], $map[$v2], $score, $created); } } }
  // remove similarity rows left over from previous runs of this computation
  db_query("DELETE FROM {recommender_similarity} WHERE app_id=%d AND created<>%d", $app_id, $created);
}

/**
 * Fast correlation matrix calculation.
 *
 * @param $matrix
 *   Has to be indexed by [0..m][0..n]. It WILL BE MODIFIED (each row is
 *   mean-centered in place) to save memory space!!!
 * @return array
 *   The symmetric correlation matrix indexed [0..m][0..m], or NULL when the
 *   input is not a well-formed rectangular matrix.
 */
function &_recommender_fast_correlation_coefficient(&$matrix) {
  if (!isset($matrix) || !is_array($matrix)) {
    return NULL;
  }
  $m = count($matrix);
  $n = count($matrix[0]); // assume the matrix has the same width in every row

  // Pass 1: center each row on its mean and collect sqrt of the (unscaled)
  // variance; dividing by n cancels out in the correlation formula anyway.
  $variance = array();
  foreach ($matrix as &$vector) {
    if (count($vector) != $n) {
      // ragged matrix: bail out
      return NULL;
    }
    // adjust each element in the vector by the mean, and calculate variance.
    $mean = array_sum($vector) / $n;
    $sigma = 0;
    for ($i = 0; $i < $n; $i++) {
      $vector[$i] = $vector[$i] - $mean;
      $sigma += pow($vector[$i], 2);
    }
    // due to float point error, a certain small number could be indeed 0.
    // but it doesn't seem to hurt. so just comment them out.
    //$vector_diagnal[] = sqrt($sigma)<0.00001 ? 0 : sqrt($sigma);
    $variance[] = sqrt($sigma); // note we didn't divide it by n.
  }
  // break the dangling reference left behind by foreach-by-reference, so a
  // later write to $vector cannot silently corrupt the last matrix row.
  unset($vector);

  // Pass 2: correlation of every pair (upper triangle, mirrored).
  $cor_matrix = array();
  $product = array();
  for ($v1 = 0; $v1 < $m; $v1++) {
    for ($v2 = $v1; $v2 < $m; $v2++) {
      $vector_1 = &$matrix[$v1];
      $vector_2 = &$matrix[$v2];
      if ($variance[$v1] == 0 || $variance[$v2] == 0) {
        // Zero variance: correlation is undefined, treat it as 0.
        // FIX: also set the mirror cell -- the original only set
        // [$v1][$v2], leaving [$v2][$v1] unset and the matrix asymmetric.
        $cor_matrix[$v1][$v2] = 0;
        $cor_matrix[$v2][$v1] = 0;
      }
      else {
        for ($i = 0; $i < $n; $i++) {
          $product[$i] = $vector_1[$i] * $vector_2[$i];
        }
        $cor = array_sum($product) / ($variance[$v1] * $variance[$v2]);
        $cor_matrix[$v1][$v2] = $cor;
        $cor_matrix[$v2][$v1] = $cor;
      }
    }
  }
  return $cor_matrix;
}

/**
 * Classical algorithm computed in databases. Full functioning.
 * It has no memory limits because all data are saved in database.
 * But the performance is worse than in-memory computation.
*/
function _recommender_similarity_classical_in_database($app_id, $table_name, $field_mouse, $field_cheese, $field_weight, $options) {
  // get param values
  $missing = isset($options['missing']) ? $options['missing'] : 'none';
  // timestamp shared by every row written in this run; rows with a different
  // timestamp are purged afterwards.
  $created = time();
  // operations would be quite different for different missing data treament
  switch ($missing) {
    // in this case, we treat the missing data as missing.
    case 'none': default:
      // compute pair-wise avg/stddev for each mouse, which is different in each pair.
      db_query("DELETE FROM {recommender_helper_pair_stat}");
      db_query("INSERT INTO {recommender_helper_pair_stat}(id1, id2, avg1, avg2, stddev1, stddev2) SELECT n1.$field_mouse, n2.$field_mouse, AVG(n1.$field_weight), AVG(n2.$field_weight), STDDEV(n1.$field_weight), STDDEV(n2.$field_weight) FROM {{$table_name}} n1 INNER JOIN {{$table_name}} n2 ON n1.$field_cheese=n2.$field_cheese GROUP BY n1.$field_mouse, n2.$field_mouse");
      // compute similarity. would be time-consuming
      db_query("INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created) SELECT %d, n1.$field_mouse, n2.$field_mouse, (AVG((n1.$field_weight-a.avg1)*(n2.$field_weight-a.avg2)))/(a.stddev1*a.stddev2) corr, %d FROM {{$table_name}} n1 INNER JOIN {{$table_name}} n2 ON n1.$field_cheese=n2.$field_cheese AND n1.$field_mouse<=n2.$field_mouse INNER JOIN {recommender_helper_pair_stat} a ON n1.$field_mouse=a.id1 AND n2.$field_mouse=a.id2 GROUP BY n1.$field_mouse, n2.$field_mouse HAVING corr<>0", $app_id, $created);
      // insert duplicate ones (to save time, we didn't do half of the calculation in operation above)
      // NOTE(review): the SQL below looks truncated/corrupted ("WHERE
      // mouse1_id0 AND s2.stddev<>0 ..."). A span of the original text --
      // probably "mouse1_id<mouse2_id ..." plus an entire case branch (the
      // 'zero'/expanded-matrix treatment) -- appears to have been eaten,
      // likely by an angle-bracket stripper. Recover from version control;
      // TODO confirm before relying on this function.
      db_query("INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created) SELECT app_id, mouse2_id, mouse1_id, similarity, created FROM {recommender_similarity} WHERE mouse1_id0 AND s2.stddev<>0 GROUP BY n1.mouse_id, n2.mouse_id HAVING corr<>0", $app_id, $created);
      // insert duplicate ones (to save time, we didn't do half of the calculation in operation above)
      // NOTE(review): this statement also appears truncated ("WHERE
      // mouse1_id%d") for the same reason as above -- TODO confirm.
      db_query("INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created) SELECT app_id, mouse2_id, mouse1_id, similarity, created FROM {recommender_similarity} WHERE mouse1_id%d", $app_id, $created);
  }
  // NOTE(review): the closing brace of this function appears to be missing
  // (lost in the same corruption); as written, the following functions are
  // nested inside it. Verify against version control.

/**
 * Expand the sparse database table to have the missing data default as zero.
 * 'adjusted' means that only mouses that share common cheese will be expanded.
 *
 * @param $table_name the input table name
 * @param $field_mouse the input table field for mouse
 * @param $field_cheese the input table field for cheese
 * @param $field_weight the input table field weight
 * @param $missing the way of handling missing data. could be either 'zero' or 'adjusted'.
 * @return null the {recommender_helper_matrix} and {recommender_helper_pair_stat} will be filled with data.
 */
function _recommender_expand_sparse_data($table_name, $field_mouse, $field_cheese, $field_weight, $missing) {
  // clean up
  db_query("DELETE FROM {recommender_helper_matrix}");
  // if 'adjusted', we record the pairs of mice that share at least one cheese.
  // we'll skip those mice that don't share cheese with other mice.
  // this measure is extremely useful for very sparse dataset.
  if ($missing=='adjusted') {
    db_query("DELETE FROM {recommender_helper_pair_stat}");
    db_query("INSERT INTO {recommender_helper_pair_stat}(id1, id2) SELECT n1.$field_mouse, n2.$field_mouse FROM {{$table_name}} n1 INNER JOIN {{$table_name}} n2 ON n1.$field_cheese=n2.$field_cheese GROUP BY n1.$field_mouse, n2.$field_mouse");
  }
  // expand sparse matrix in {$table_name} to the full matrix, saved to {recommender_helper_matrix}, initially as 0
  // if 'adjusted', then we skip those mice that don't share at least one cheese with other mice.
  $sql_adjusted = ($missing!='adjusted') ?
    '' : "INNER JOIN (SELECT DISTINCT id1 FROM {recommender_helper_pair_stat}) p ON l.$field_mouse=p.id1";
  // cross join of all (mouse, cheese) pairs, weight initially 0
  db_query("INSERT INTO {recommender_helper_matrix} SELECT m.mouse_id, c.cheese_id, 0 FROM (SELECT DISTINCT l.$field_mouse mouse_id FROM {{$table_name}} l $sql_adjusted) m JOIN (SELECT DISTINCT $field_cheese cheese_id FROM {{$table_name}}) c");
  // update the full matrix to have the correct value where cell is not 0
  db_query("UPDATE {recommender_helper_matrix} m, {{$table_name}} l SET m.weight=l.$field_weight WHERE m.mouse_id=l.$field_mouse AND m.cheese_id=l.$field_cheese");
}

/**
 * Classical weight-average algorithm to calculate prediction from the similarity matrix, based on average weight.
 * Limitation: we only do database operation for now. no in-memory operation available until future release.
 * Limitation: we only do average weight. regression-based weight maybe included in future release.
 *
 * @param $app_name the application name that uses this function.
 * @param $table_name the input table name
 * @param $field_mouse the input table field for mouse
 * @param $field_cheese the input table field for cheese
 * @param $field_weight the input table field weight
 * @param $options an array of options
 *   'missing': how to handle missing data -- 'none' do nothing; 'zero' fill in missing data with zero; 'adjusted' skip mice that don't share cheese in common.
 *   'duplicate': how to handle predictions that already exists in mouse-cheese evaluation. 'preserve' or 'eliminate'
 *   'sensitivity': if similarity is smaller enough to be less than a certain value, we treat it as zero
 * @return null {recommender_prediction} will be filled with prediction data
 */
function recommender_prediction_classical($app_name, $table_name = 'recommender_link', $field_mouse = 'mouse_id', $field_cheese = 'cheese_id', $field_weight = 'weight', $options = array()) {
  // get param values
  $app_id = recommender_get_app_id($app_name);
  $missing = isset($options['missing']) ? $options['missing'] : 'none';
  $duplicate = isset($options['duplicate']) ? $options['duplicate'] : 'preserve'; // could be 'eliminate'
  $sensitivity = isset($options['sensitivity']) ? $options['sensitivity'] : 0;
  $created = time();

  // append missing data with 0s. then use table {recommender_helper_matrix} instead.
  // note: the local table/field variables are repointed at the expanded copy.
  if ($missing == 'zero' || $missing == 'adjusted') {
    _recommender_expand_sparse_data($table_name, $field_mouse, $field_cheese, $field_weight, $missing);
    $table_name = '{recommender_helper_matrix}';
    $field_mouse = 'mouse_id';
    $field_cheese = 'cheese_id';
    $field_weight = 'weight';
  }

  // calculate the mean value for each mouse, will be used as the starting point of each prediction.
  db_query("DELETE FROM {recommender_helper_single_stat}");
  db_query("INSERT INTO {recommender_helper_single_stat}(id, avg) SELECT $field_mouse, AVG($field_weight) FROM {{$table_name}} GROUP BY $field_mouse");

  // for some reason, the following SQL, esp."INNER JOIN {recommender_helper_single_stat} m1 ON s.mouse1_id=m1.id"
  // is really time-consuming. probably it's also in the GROUP BY?
  // changed to the 2ne SQL approach is much more efficient.
  /* db_query("INSERT INTO {recommender_prediction}(app_id, mouse_id, cheese_id, prediction, created) SELECT %d, s.mouse1_id, t.$field_cheese, m1.avg+SUM(s.similarity*(t.$field_weight-m2.avg))/SUM(ABS(s.similarity)) prediction, %d FROM {recommender_similarity} s INNER JOIN {recommender_helper_single_stat} m1 ON s.mouse1_id=m1.id INNER JOIN {recommender_helper_single_stat} m2 ON s.mouse2_id=m2.id INNER JOIN {{$table_name}} t ON s.mouse2_id=t.$field_mouse WHERE s.app_id=%d AND ABS(s.similarity)>%f GROUP BY s.mouse1_id, t.$field_cheese", $app_id, $created, $app_id, $sensitivity); */

  // if we treat missing data as adjusted, then we only count opinions from mice who share cheese from other mice.
  $sql_adjusted = ($missing!='adjusted') ? '' : "INNER JOIN {recommender_helper_pair_stat} p ON s.mouse1_id=p.id1 AND s.mouse2_id=p.id2";

  // calculate prediction. could be time-consuming
  db_query("INSERT INTO {recommender_prediction}(app_id, mouse_id, cheese_id, prediction, created) SELECT %d, s.mouse1_id, t.$field_cheese, SUM(s.similarity*(t.$field_weight-m.avg))/SUM(ABS(s.similarity)) prediction, %d FROM {recommender_similarity} s INNER JOIN {recommender_helper_single_stat} m ON s.mouse2_id=m.id INNER JOIN {{$table_name}} t ON s.mouse2_id=t.$field_mouse $sql_adjusted WHERE s.app_id=%d AND ABS(s.similarity)>%f GROUP BY s.mouse1_id, t.$field_cheese", $app_id, $created, $app_id, $sensitivity);

  // adjust the prediction based on each mouse's average evaluation.
  db_query("UPDATE {recommender_prediction} p, {recommender_helper_single_stat} m SET prediction=prediction+m.avg WHERE p.mouse_id=m.id AND p.app_id=%d AND created=%d", $app_id, $created);

  // remove duplicate prediction
  if ($duplicate == 'eliminate') {
    if ($missing == 'zero' || $missing == 'adjusted') {
      $missing_sql = "WHERE $field_weight<>0";
    }
    // NOTE(review): when $missing is 'none', $missing_sql is never assigned
    // and interpolates as an empty string (with an E_NOTICE). Looks intended,
    // but verify.
    db_query("DELETE FROM {recommender_prediction} WHERE app_id=%d AND created=%d AND (mouse_id, cheese_id) IN (SELECT $field_mouse, $field_cheese FROM {{$table_name}} $missing_sql)", $app_id, $created);
  }

  // clean_up
  db_query("DELETE FROM {recommender_helper_single_stat}");
  if ($missing == 'zero' || $missing == 'adjusted') {
    db_query("DELETE FROM {recommender_helper_matrix}");
    if ($missing == 'adjusted') {
      db_query("DELETE FROM {recommender_helper_pair_stat}");
    }
  }
  // remove old predictions from the last calculation
  db_query("DELETE FROM {recommender_prediction} WHERE app_id=%d AND created<>%d", $app_id, $created);
}

/**
 * Co-ocurrences algorithm that compute similarity among mice based on how many cheese they share.
 *
 * @param $app_name the application name that uses this function.
 * @param $table_name the input table name
 * @param $field_mouse the input table field for mouse
 * @param $field_cheese the input table field for cheese
 * @param $options an array of options
 *   'incremental': whether to rebuild the whole similarity matrix, or incrementally update those got changed. 'rebuild' or 'update'
 * @return null {recommender_similarity} will be filled with similarity data
 */
function recommender_similarity_coocurrences($app_name, $table_name, $field_mouse, $field_cheese, $options = array()) {
  // get param values
  $app_id = recommender_get_app_id($app_name);
  $created = time();
  $incremental = isset($options['incremental']) ? $options['incremental'] : 'rebuild';

  // the basic skeleton sql
  // note: $app_id and $created are interpolated directly (they come from
  // recommender_get_app_id()/time(), not from user input) so that the same
  // string can be reused with an appended HAVING clause below.
  $basic_sql = "INSERT INTO {recommender_similarity}(app_id, mouse1_id, mouse2_id, similarity, created) SELECT $app_id, n1.$field_mouse, n2.$field_mouse, COUNT(n1.$field_cheese), $created FROM {{$table_name}} n1 INNER JOIN {{$table_name}} n2 ON n1.$field_cheese=n2.$field_cheese GROUP BY n1.$field_mouse, n2.$field_mouse";

  switch ($incremental) {
    // rebuild the similarity index
    case 'rebuild': default:
      db_query($basic_sql);
      // a full rebuild supersedes any queued incremental updates
      db_query("DELETE FROM {recommender_updated_queue} WHERE app_id=%d AND created<=%d", $app_id, $created);
      db_query("DELETE FROM {recommender_similarity} WHERE app_id=%d AND created<>%d", $app_id, $created);
      break;

    // incrementally update. might incur more overhead than 'rebuild'
    case 'update':
      $max_timestamp = _recommender_updated_max($app_id);
      // nothing queued since the last run: nothing to do
      if ($max_timestamp == NULL) return;
      // parenthesized id list, e.g. "(1,5,9)", built by the helper
      $ids = _recommender_updated_list($app_id, $max_timestamp, 'mouse_id');
      db_query("DELETE FROM {recommender_similarity} WHERE app_id=%d AND (mouse1_id IN $ids OR mouse2_id IN $ids)", $app_id);
      // recompute only rows touching an updated mouse
      db_query($basic_sql ." HAVING (n1.$field_mouse IN $ids OR n2.$field_mouse IN $ids)");
      _recommender_updated_markdone($app_id, $max_timestamp);
      break;
  }
}

/**
 * This is the implementation of slope-one algorithm, which is supposed to be faster than other algrithms.
 * From the original research paper, the author argues slope-one support incremental update. But this is quite hard to implement.
 * The incremental update doesn't work for now.
 * Missing data is just treated as missing. For slope-one, we don't automatically expand the matrix to have zeros.
 * The responsibility of handling missing data is on the caller functions.
 *
 * @param $app_name the application name that uses this function.
 * @param $table_name the input table name
 * @param $field_mouse the input table field for mouse
 * @param $field_cheese the input table field for cheese
 * @param $field_weight the input table field weight
 * @param $options an array of options
 *   'extention': whether to use 'basic', 'weighted', or 'bipolar' extensions of the algorithm.
 *   Please refer to the original research paper. Usually it could just be 'weighted'.
 *   'duplicate': how to handle predictions that already exists in mouse-cheese evaluation. 'preserve' or 'eliminate'
 *   'incremental': whether to 'rebuild' or to 'update'. CAUTION: 'update' doesn't work now.
 * @return null {recommender_prediction} will be filled with prediction data
 */
function recommender_prediction_slopeone($app_name, $table_name, $field_mouse, $field_cheese, $field_weight, $options = array()) {
  // get param values
  $app_id = recommender_get_app_id($app_name);
  $extension = isset($options['extension']) ? $options['extension'] : 'weighted'; // could be 'weighted', 'bipolar'
  $incremental = isset($options['incremental']) ? $options['incremental'] : 'rebuild'; // could be 'update'
  $duplicate = isset($options['duplicate']) ? $options['duplicate'] : 'eliminate'; // or 'preserve'
  $created = time();

  db_query("DELETE FROM {recommender_slopeone_dev} WHERE app_id=%d", $app_id);
  // create dev(i,j): average weight deviation between every pair of cheeses
  // rated by the same mouse, plus the support count.
  db_query("INSERT INTO {recommender_slopeone_dev}(app_id, cheese1_id, cheese2_id, count, dev) SELECT %d, n1.$field_cheese, n2.$field_cheese, COUNT(*), AVG(n1.$field_weight-n2.$field_weight) FROM {{$table_name}} n1 INNER JOIN {{$table_name}} n2 ON n1.$field_mouse=n2.$field_mouse AND n1.$field_cheese <> n2.$field_cheese GROUP BY n1.$field_cheese, n2.$field_cheese", $app_id);

  // create P(u,j): choose the aggregation expression for the chosen extension.
  if ($extension == 'basic') {
    $extension_sql = "AVG(t.$field_weight+p.dev)";
  } else if ($extension == 'weighted') { // the 'weighted slope one'
    $extension_sql = "SUM((t.$field_weight+p.dev)*p.count)/SUM(p.count)";
  }
  // haven't implemented the "bipolar" extension of Slope One.
  // NOTE(review): for 'bipolar' or any other unrecognized extension value,
  // $extension_sql stays undefined and the INSERT below interpolates an empty
  // string, producing malformed SQL -- confirm and guard in a future fix.

  // generate predictions.
  db_query("INSERT INTO {recommender_prediction}(app_id, mouse_id, cheese_id, prediction, created) SELECT %d, t.$field_mouse, p.cheese1_id, $extension_sql, %d FROM {recommender_slopeone_dev} p INNER JOIN {{$table_name}} t ON p.cheese2_id=t.$field_cheese GROUP BY t.$field_mouse, p.cheese1_id", $app_id, $created);

  // remove duplicate prediction
  if ($duplicate == 'eliminate') {
    db_query("DELETE FROM {recommender_prediction} WHERE app_id=%d AND created=%d AND (mouse_id, cheese_id) IN (SELECT $field_mouse, $field_cheese FROM {{$table_name}})", $app_id, $created);
  }
  // purge predictions left from earlier runs
  db_query("DELETE FROM {recommender_prediction} WHERE app_id=%d AND created<>%d", $app_id, $created);
}

/////////////////////// Helper functions /////////////////////

/**
 * Get the application id from the application name.
 * @param $app_name
 * @return the id of the application.
*/
function recommender_get_app_id($app_name) {
  if (!isset($app_name) || empty($app_name)) {
    return NULL;
  }
  // look up the id; '@' suppresses the error if {recommender_app_map} does
  // not exist yet. NOTE(review): SELECT-then-INSERT is racy under concurrent
  // callers (two could both insert) -- confirm whether app_name is unique-keyed.
  $id = db_result(@db_query("SELECT app_id FROM {recommender_app_map} WHERE app_name='%s'", $app_name));
  if (!isset($id) || empty($id)) {
    // first time this application is seen: register it and re-read the id.
    db_query("INSERT INTO {recommender_app_map}(app_name) VALUE('%s')", $app_name);
    $id = db_result(db_query("SELECT app_id FROM {recommender_app_map} WHERE app_name='%s'", $app_name));
  }
  return $id;
}

/*
 * Below is a set of incremental update helper functions
 * This is very experiment. Please use with caution.
 */

// Add a new record to the updated_queue for further process.
function recommender_updated_add($app_name, $mouse_id, $cheese_id, $old_value, $new_value) {
  $app_id = recommender_get_app_id($app_name);
  $created = time();
  // map PHP NULL to the literal string 'NULL' so it is written unquoted by
  // the unquoted %s placeholders below and becomes SQL NULL.
  // NOTE(review): non-null values also pass through %s unquoted -- this only
  // works for numeric weights; confirm that's the intended domain.
  if ($old_value == NULL) $old_value = 'NULL';
  if ($new_value == NULL) $new_value = 'NULL';
  db_query("INSERT INTO {recommender_updated_queue}(app_id, mouse_id, cheese_id, old, new, created) VALUE(%d, %d, %d, %s, %s, %d)", $app_id, $mouse_id, $cheese_id, $old_value, $new_value, $created);
}

// purge the updated queue: 'outdated' removes only processed rows
// (status=-100, see _recommender_updated_markdone()); anything else wipes all
// rows for the application.
function recommender_updated_purge($app_name, $op = 'outdated') {
  $app_id = recommender_get_app_id($app_name);
  $op_sql = ($op == 'outdated') ? "AND status=-100" : '';
  db_query("DELETE FROM {recommender_updated_queue} WHERE app_id=%d $op_sql", $app_id);
}

// get the max timestamp from the updated queue (unprocessed rows only);
// returns NULL when nothing is pending.
function _recommender_updated_max($app_id) {
  $updated_info = db_fetch_array(db_query("SELECT COUNT(*) count, MAX(created) max FROM {recommender_updated_queue} WHERE app_id=%d AND status!=-100", $app_id));
  // if nothing is updated, then do nothing.
  if ($updated_info['count'] == 0) return NULL;
  return $updated_info['max'];
}

// get the list of updated items in parenthesis.
/**
 * Build a parenthesized, comma-separated list of the distinct $field values
 * queued up to $max_timestamp, e.g. "(1,5,9)", suitable for splicing into an
 * SQL "IN" clause by the callers.
 *
 * @param $app_id the application id, as returned by recommender_get_app_id().
 * @param $max_timestamp only queue entries created at or before this time are included.
 * @param $field the queue column to collect, e.g. 'mouse_id' or 'cheese_id'.
 * @return string the list in parenthesis; "()" when nothing matched.
 */
function _recommender_updated_list($app_id, $max_timestamp, $field = 'mouse_id') {
  $result = db_query("SELECT DISTINCT $field FROM {recommender_updated_queue} WHERE app_id=%d AND created<=%d", $app_id, $max_timestamp);
  // FIX: initialize the accumulator. The original appended to an undeclared
  // variable, which raises an E_NOTICE and hands NULL to implode() whenever
  // the result set is empty.
  $updated = array();
  while ($r = db_fetch_array($result)) {
    $updated[] = $r[$field];
  }
  return '('. implode(',', $updated) .')';
}

/**
 * Mark queue items up to $max_timestamp as processed (status=-100), so that
 * _recommender_updated_max() and recommender_updated_purge() skip them.
 */
function _recommender_updated_markdone($app_id, $max_timestamp) {
  db_query("UPDATE {recommender_updated_queue} SET status=-100 WHERE app_id=%d AND created<=%d", $app_id, $max_timestamp);
}