Uva Searchengines / protected / controllers / DatabaseController.php

<?php

/**
 * Builds a simple TF-IDF index over the `articles` table.
 *
 * The controller tokenises every article body, counts terms per article and
 * across the corpus, and stores the highest scoring terms of each article in
 * `articles_word`. Term totals can be written to `words` via insertWords().
 *
 * NOTE(review): word ids written to `articles_word` are derived from the
 * 1-based position of a term in $wordsAndCount. That only matches the ids in
 * `words` if insertWords() filled a freshly truncated table (auto-increment
 * starting at 1) from the same article set in the same order — confirm before
 * relying on the join.
 */
class DatabaseController extends Controller
{
	/** Number of highest scoring terms stored per article. */
	const TOP_TERMS_PER_ARTICLE = 50;

	/** @var string[] Terms that are ignored while indexing. */
	private $stopWords = array("a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such", "t", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with");

	/** @var array article id => (term => occurrences of the term in that article) */
	private $articleIdAndTermAndCount = array();

	/** @var array term => occurrences of the term over all articles */
	private $wordsAndCount = array();

	/** @var array Never populated by this class; only its size is dumped as debug output. */
	private $wordsAndTfidf = array();

	/** @var array term => number of articles that contain the term */
	private $wordArticleCount = array();

	/** @var int Number of articles read by getArticles(). */
	private $docCount = 0;


	/**
	 * Action: reads all articles, builds the term statistics and inserts the
	 * TF-IDF rows, then prints debug figures and terminates the request.
	 */
	public function actionInsertWords() {
		// Reset every accumulator (including the document counter) so that a
		// second call on the same instance does not build on stale numbers.
		$this->articleIdAndTermAndCount = array();
		$this->wordsAndCount = array();
		$this->wordsAndTfidf = array();
		$this->wordArticleCount = array();
		$this->docCount = 0;

		$articleIdAndBody = $this->getArticles();
		$this->setArticleWordsAndCounts($articleIdAndBody);
		// The bodies are no longer needed; release them here, in the scope
		// that actually owns the array, before the insert phase starts.
		unset($articleIdAndBody);

		// Filling the `words` table is disabled; enable it for a full rebuild.
		//        $this->insertWords();
		$this->setInsertTfidf();

		var_dump(count($this->wordsAndTfidf));
		echo number_format(memory_get_usage()) . "\n";
		die();
	}

	/**
	 * Loads every article and normalises its body for tokenising.
	 *
	 * The body is lower-cased and every character that is not a letter a-z,
	 * a digit or a space is replaced by a space. $docCount is incremented once
	 * per row.
	 *
	 * If the query is ever restricted to an id range, pass the bounds as bound
	 * parameters; never concatenate request input into the SQL string.
	 *
	 * @return array article id => normalised body
	 */
	private function getArticles() {
		$sql = "SELECT id, body FROM articles";
		$dataReader = Yii::app()->db->createCommand($sql)->query();

		$arr = array();
		// Iterate the reader row by row instead of readAll(), so the raw rows
		// and the normalised copies are not held in memory at the same time.
		foreach ($dataReader as $row) {
			$this->docCount++;
			$arr[$row['id']] = preg_replace("/[^a-z \d]/i", " ", strtolower($row['body']));
		}
		return $arr;
	}

	/**
	 * Tokenises the article bodies and fills the three count tables.
	 *
	 * Numeric tokens and stop words are skipped. Terms are appended to
	 * $wordsAndCount in order of first appearance; setInsertTfidf() derives
	 * word ids from that order.
	 *
	 * @param array $articleIdAndBody article id => normalised body
	 */
	private function setArticleWordsAndCounts($articleIdAndBody) {
		// Hash lookup instead of a linear in_array() scan for every token.
		$stopWordLookup = array_flip($this->stopWords);

		foreach ($articleIdAndBody as $articleid => $body) {
			$terms = preg_split('/\s+/', $body, -1, PREG_SPLIT_NO_EMPTY);
			if ($terms === false) {
				continue;
			}

			foreach ($terms as $term) {
				if (is_numeric($term) || isset($stopWordLookup[$term])) {
					continue;
				}

				if (!isset($this->articleIdAndTermAndCount[$articleid][$term])) {
					// First occurrence of the term in this article: the
					// article counts once towards the document frequency.
					$this->articleIdAndTermAndCount[$articleid][$term] = 0;
					if (!isset($this->wordArticleCount[$term])) {
						$this->wordArticleCount[$term] = 0;
					}
					$this->wordArticleCount[$term]++;
				}
				$this->articleIdAndTermAndCount[$articleid][$term]++;

				if (!isset($this->wordsAndCount[$term])) {
					$this->wordsAndCount[$term] = 0;
				}
				$this->wordsAndCount[$term]++;
			}
		}
	}

	/**
	 * Truncates `words` and inserts one row per term with its corpus count.
	 */
	private function insertWords() {
		$db = Yii::app()->db;
		$db->createCommand()->truncateTable('words');

		$insertCommand = $db->createCommand("INSERT INTO words (count, text) VALUES(:count,:text)");

		foreach ($this->wordsAndCount as $term => $termCount) {
			$insertCommand->bindValue(":count", $termCount, PDO::PARAM_INT);
			$insertCommand->bindValue(":text", $term, PDO::PARAM_STR);
			$insertCommand->execute();
		}
	}

	/**
	 * Computes count * log2(docCount / articles containing the term) for every
	 * term of every article and inserts the top scoring terms per article
	 * into `articles_word`.
	 *
	 * Articles that already have rows in `articles_word` are skipped, so an
	 * interrupted run can be resumed without creating duplicates.
	 */
	private function setInsertTfidf() {
		$insertSql = "INSERT INTO articles_word (article_id, word_id, count, score) VALUES(:article_id,:word_id, :count, :score)";
		$countSql = 'SELECT COUNT(id) FROM articles_word WHERE article_id = :article_id;';
		$db = Yii::app()->db;
		$insertCommand = $db->createCommand($insertSql);
		$countCommand = $db->createCommand($countSql);

		// Map every term to its 1-based position once, instead of scanning
		// the whole vocabulary again for each inserted row.
		$wordIds = array();
		$position = 0;
		foreach ($this->wordsAndCount as $term => $unused) {
			$wordIds[$term] = ++$position;
		}

		foreach ($this->articleIdAndTermAndCount as $articleid => $termsCounts) {
			$countCommand->bindValue(":article_id", $articleid, PDO::PARAM_STR);
			$rowCount = (int) $countCommand->queryScalar();
			// The former second clause called count() on an integer; that is
			// an error on PHP 8 and never changed the outcome on older
			// versions, so only the "already indexed" check remains.
			if ($rowCount !== 0) {
				continue;
			}

			$articleTfidfs = array();
			foreach ($termsCounts as $term => $count) {
				$articleTfidfs[$term] = $count * log($this->docCount / $this->wordArticleCount[$term], 2);
			}
			// Highest score first; arsort() keeps the term keys intact.
			arsort($articleTfidfs);
			$slice = array_slice($articleTfidfs, 0, self::TOP_TERMS_PER_ARTICLE, true);

			foreach ($slice as $term => $tfidf) {
				$insertCommand->bindValue(":word_id", $wordIds[$term], PDO::PARAM_INT);
				$insertCommand->bindValue(":article_id", $articleid, PDO::PARAM_STR);
				$insertCommand->bindValue(":count", $termsCounts[$term], PDO::PARAM_INT);
				$insertCommand->bindValue(":score", $tfidf, PDO::PARAM_STR);
				$insertCommand->execute();
			}
		}
	}

	/**
	 * Array key index: zero-based position of $key within $arr.
	 *
	 * Keys are compared as strings, so an integer key 5 still matches "5",
	 * while unrelated keys that are merely loosely equal (for example "1e3"
	 * and "1000") no longer match. No longer used inside this class; kept for
	 * existing callers.
	 *
	 * @author Nate Ferrero
	 * @param array $arr Array to search (taken by reference to avoid a copy; not modified)
	 * @param int|string $key Key to locate
	 * @return int|null Position of the key, or null when the key is absent
	 */
	function array_key_index(&$arr, $key) {
		$needle = (string) $key;
		$i = 0;
		foreach ($arr as $k => $unused) {
			if ((string) $k === $needle) {
				return $i;
			}
			$i++;
		}
		return null;
	}
}
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.