add uid variable to b8 classes
This commit is contained in:
		| @@ -205,7 +205,7 @@ class b8 | |||||||
| 	 * @return float The rating between 0 (ham) and 1 (spam) | 	 * @return float The rating between 0 (ham) and 1 (spam) | ||||||
| 	 */ | 	 */ | ||||||
|  |  | ||||||
| 	public function classify($text) | 	public function classify($uid,$text) | ||||||
| 	{ | 	{ | ||||||
|  |  | ||||||
| 		# Validate the startup | 		# Validate the startup | ||||||
| @@ -217,7 +217,7 @@ class b8 | |||||||
|  |  | ||||||
| 		# Get the internal database variables, containing the number of ham and | 		# Get the internal database variables, containing the number of ham and | ||||||
| 		# spam texts so the spam probability can be calculated in relation to them | 		# spam texts so the spam probability can be calculated in relation to them | ||||||
| 		$internals = $this->_database->get_internals(); | 		$internals = $this->_database->get_internals($uid); | ||||||
|  |  | ||||||
| 		# Calculate the spamminess of all tokens | 		# Calculate the spamminess of all tokens | ||||||
|  |  | ||||||
| @@ -231,7 +231,7 @@ class b8 | |||||||
| 			return $tokens; | 			return $tokens; | ||||||
|  |  | ||||||
| 		# Fetch all available data for the token set from the database | 		# Fetch all available data for the token set from the database | ||||||
| 		$this->_token_data = $this->_database->get(array_keys($tokens)); | 		$this->_token_data = $this->_database->get(array_keys($tokens),$uid); | ||||||
|  |  | ||||||
| 		# Calculate the spamminess and importance for each token (or a degenerated form of it) | 		# Calculate the spamminess and importance for each token (or a degenerated form of it) | ||||||
|  |  | ||||||
| @@ -441,9 +441,9 @@ class b8 | |||||||
| 	 * @return void | 	 * @return void | ||||||
| 	 */ | 	 */ | ||||||
|  |  | ||||||
| 	public function learn($text, $category) | 	public function learn($text, $category, $uid) | ||||||
| 	{ | 	{ | ||||||
| 		return $this->_process_text($text, $category, self::LEARN); | 		return $this->_process_text($text, $category, self::LEARN, $uid); | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	/** | 	/** | ||||||
| @@ -455,9 +455,9 @@ class b8 | |||||||
| 	 * @return void | 	 * @return void | ||||||
| 	 */ | 	 */ | ||||||
|  |  | ||||||
| 	public function unlearn($text, $category) | 	public function unlearn($text, $category, $uid) | ||||||
| 	{ | 	{ | ||||||
| 		return $this->_process_text($text, $category, self::UNLEARN); | 		return $this->_process_text($text, $category, self::UNLEARN, $uid); | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	/** | 	/** | ||||||
| @@ -470,7 +470,7 @@ class b8 | |||||||
| 	 * @return void | 	 * @return void | ||||||
| 	 */ | 	 */ | ||||||
|  |  | ||||||
| 	private function _process_text($text, $category, $action) | 	private function _process_text($text, $category, $action, $uid = 0) | ||||||
| 	{ | 	{ | ||||||
|  |  | ||||||
| 		# Validate the startup | 		# Validate the startup | ||||||
| @@ -494,7 +494,7 @@ class b8 | |||||||
| 			return $tokens; | 			return $tokens; | ||||||
|  |  | ||||||
| 		# Pass the tokens and what to do with it to the storage backend | 		# Pass the tokens and what to do with it to the storage backend | ||||||
| 		return $this->_database->process_text($tokens, $category, $action); | 		return $this->_database->process_text($tokens, $category, $action, $uid); | ||||||
|  |  | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										503
									
								
								library/spam/b8/b8.php.ORIG
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										503
									
								
								library/spam/b8/b8.php.ORIG
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,503 @@ | |||||||
|  | <?php | ||||||
|  |  | ||||||
|  | #   Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de> | ||||||
|  | # | ||||||
|  | #   b8 - A Bayesian spam filter written in PHP 5 | ||||||
|  | # | ||||||
|  | #   This program is free software; you can redistribute it and/or modify it | ||||||
|  | #   under the terms of the GNU Lesser General Public License as published by | ||||||
|  | #   the Free Software Foundation in version 2.1 of the License. | ||||||
|  | # | ||||||
|  | #   This program is distributed in the hope that it will be useful, but | ||||||
|  | #   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | ||||||
|  | #   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public | ||||||
|  | #   License for more details. | ||||||
|  | # | ||||||
|  | #   You should have received a copy of the GNU Lesser General Public License | ||||||
|  | #   along with this program; if not, write to the Free Software Foundation, | ||||||
|  | #   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. | ||||||
|  |  | ||||||
|  | /** | ||||||
|  |  * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de> | ||||||
|  |  * | ||||||
|  |  * @license LGPL | ||||||
|  |  * @access public | ||||||
|  |  * @package b8 | ||||||
|  |  * @author Tobias Leupold | ||||||
|  |  * @author Oliver Lillie (aka buggedcom) (original PHP 5 port) | ||||||
|  |  */ | ||||||
|  |  | ||||||
|  | class b8 | ||||||
|  | { | ||||||
|  |  | ||||||
|  | 	public $config = array( | ||||||
|  | 		'min_size'      => 3, | ||||||
|  | 		'max_size'      => 30, | ||||||
|  | 		'allow_numbers' => FALSE, | ||||||
|  | 		'lexer'         => 'default', | ||||||
|  | 		'degenerator'   => 'default', | ||||||
|  | 		'storage'       => 'dba', | ||||||
|  | 		'use_relevant'  => 15, | ||||||
|  | 		'min_dev'       => 0.2, | ||||||
|  | 		'rob_s'         => 0.3, | ||||||
|  | 		'rob_x'         => 0.5 | ||||||
|  | 	); | ||||||
|  |  | ||||||
|  | 	private $_lexer      = NULL; | ||||||
|  | 	private $_database   = NULL; | ||||||
|  | 	private $_token_data = NULL; | ||||||
|  |  | ||||||
|  | 	const SPAM    = 'spam'; | ||||||
|  | 	const HAM     = 'ham'; | ||||||
|  | 	const LEARN   = 'learn'; | ||||||
|  | 	const UNLEARN = 'unlearn'; | ||||||
|  |  | ||||||
|  | 	const STARTUP_FAIL_DATABASE = 'STARTUP_FAIL_DATABASE'; | ||||||
|  | 	const STARTUP_FAIL_LEXER    = 'STARTUP_FAIL_LEXER'; | ||||||
|  | 	const TRAINER_CATEGORY_FAIL = 'TRAINER_CATEGORY_FAIL'; | ||||||
|  |  | ||||||
|  | 	/** | ||||||
|  | 	 * Constructs b8 | ||||||
|  | 	 * | ||||||
|  | 	 * @access public | ||||||
|  | 	 * @return void | ||||||
|  | 	 */ | ||||||
|  |  | ||||||
|  | 	function __construct($config = array(), $database_config) | ||||||
|  | 	{ | ||||||
|  |  | ||||||
|  | 		# Validate config data | ||||||
|  |  | ||||||
|  | 		if(count($config) > 0) { | ||||||
|  |  | ||||||
|  | 			foreach ($config as $name=>$value) { | ||||||
|  |  | ||||||
|  | 				switch($name) { | ||||||
|  |  | ||||||
|  | 					case 'min_dev': | ||||||
|  | 					case 'rob_s': | ||||||
|  | 					case 'rob_x': | ||||||
|  | 						$this->config[$name] = (float) $value; | ||||||
|  | 						break; | ||||||
|  |  | ||||||
|  | 					case 'min_size': | ||||||
|  | 					case 'max_size': | ||||||
|  | 					case 'use_relevant': | ||||||
|  | 						$this->config[$name] = (int) $value; | ||||||
|  | 						break; | ||||||
|  |  | ||||||
|  | 					case 'allow_numbers': | ||||||
|  | 						$this->config[$name] = (bool) $value; | ||||||
|  | 						break; | ||||||
|  |  | ||||||
|  | 					case 'lexer': | ||||||
|  | 						$value = (string) strtolower($value); | ||||||
|  | 						$this->config[$name] = is_file(dirname(__FILE__) . DIRECTORY_SEPARATOR . 'lexer' . DIRECTORY_SEPARATOR . "lexer_" . $value . '.php') === TRUE ? $value : 'default'; | ||||||
|  | 						break; | ||||||
|  |  | ||||||
|  | 					case 'storage': | ||||||
|  | 						$this->config[$name] = (string) $value; | ||||||
|  | 						break; | ||||||
|  |  | ||||||
|  | 				} | ||||||
|  |  | ||||||
|  | 			} | ||||||
|  |  | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 		# Setup the database backend | ||||||
|  |  | ||||||
|  | 		# Get the basic storage class used by all backends | ||||||
|  | 		if($this->load_class('b8_storage_base', dirname(__FILE__) . DIRECTORY_SEPARATOR . 'storage' . DIRECTORY_SEPARATOR . 'storage_base.php') === FALSE) | ||||||
|  | 			return; | ||||||
|  |  | ||||||
|  | 		# Get the degenerator we need | ||||||
|  | 		if($this->load_class('b8_degenerator_' . $this->config['degenerator'], dirname(__FILE__) . DIRECTORY_SEPARATOR . 'degenerator' . DIRECTORY_SEPARATOR . 'degenerator_' . $this->config['degenerator'] . '.php') === FALSE) | ||||||
|  | 			return; | ||||||
|  |  | ||||||
|  | 		# Get the actual storage backend we need | ||||||
|  | 		if($this->load_class('b8_storage_' . $this->config['storage'], dirname(__FILE__) . DIRECTORY_SEPARATOR . 'storage' . DIRECTORY_SEPARATOR . 'storage_' . $this->config['storage'] . '.php') === FALSE) | ||||||
|  | 			return; | ||||||
|  |  | ||||||
|  | 		# Setup the backend | ||||||
|  | 		$class = 'b8_storage_' . $this->config['storage']; | ||||||
|  | 		$this->_database = new $class( | ||||||
|  | 			$database_config, | ||||||
|  | 			$this->config['degenerator'], date('ymd') | ||||||
|  | 		); | ||||||
|  |  | ||||||
|  | 		# Setup the lexer class | ||||||
|  |  | ||||||
|  | 		if($this->load_class('b8_lexer_' . $this->config['lexer'], dirname(__FILE__) . DIRECTORY_SEPARATOR . 'lexer' . DIRECTORY_SEPARATOR . 'lexer_' . $this->config['lexer'] . '.php') === FALSE) | ||||||
|  | 			return; | ||||||
|  |  | ||||||
|  | 		$class = 'b8_lexer_' . $this->config['lexer']; | ||||||
|  | 		$this->_lexer = new $class( | ||||||
|  | 			array( | ||||||
|  | 				'min_size' => $this->config['min_size'], | ||||||
|  | 				'max_size' => $this->config['max_size'], | ||||||
|  | 				'allow_numbers' => $this->config['allow_numbers'] | ||||||
|  | 			) | ||||||
|  | 		); | ||||||
|  |  | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	/** | ||||||
|  | 	 * Load a class file if a class has not been defined yet. | ||||||
|  | 	 * | ||||||
|  | 	 * @access public | ||||||
|  | 	 * @return boolean Returns TRUE if everything is okay, otherwise FALSE. | ||||||
|  | 	 */ | ||||||
|  |  | ||||||
|  | 	public function load_class($class_name, $class_file) | ||||||
|  | 	{ | ||||||
|  |  | ||||||
|  | 		if(class_exists($class_name, FALSE) === FALSE) { | ||||||
|  |  | ||||||
|  | 			$included = require_once $class_file; | ||||||
|  |  | ||||||
|  | 			if($included === FALSE or class_exists($class_name, FALSE) === FALSE) | ||||||
|  | 				return FALSE; | ||||||
|  |  | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 		return TRUE; | ||||||
|  |  | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	/** | ||||||
|  | 	 * Validates the class has all it needs to work. | ||||||
|  | 	 * | ||||||
|  | 	 * @access public | ||||||
|  | 	 * @return mixed Returns TRUE if everything is okay, otherwise an error code. | ||||||
|  | 	 */ | ||||||
|  |  | ||||||
|  | 	public function validate() | ||||||
|  | 	{ | ||||||
|  |  | ||||||
|  | 		if($this->_database === NULL) | ||||||
|  | 			return self::STARTUP_FAIL_DATABASE; | ||||||
|  |  | ||||||
|  | 		# Connect the database backend if we aren't connected yet | ||||||
|  |  | ||||||
|  | 		elseif($this->_database->connected === FALSE) { | ||||||
|  |  | ||||||
|  | 			$connection = $this->_database->connect(); | ||||||
|  |  | ||||||
|  | 			if($connection !== TRUE) | ||||||
|  | 				return $connection; | ||||||
|  |  | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 		if($this->_lexer === NULL) | ||||||
|  | 			return self::STARTUP_FAIL_LEXER; | ||||||
|  |  | ||||||
|  | 		return TRUE; | ||||||
|  |  | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	/** | ||||||
|  | 	 * Classifies a text | ||||||
|  | 	 * | ||||||
|  | 	 * @access public | ||||||
|  | 	 * @package default | ||||||
|  | 	 * @param string $text | ||||||
|  | 	 * @return float The rating between 0 (ham) and 1 (spam) | ||||||
|  | 	 */ | ||||||
|  |  | ||||||
|  | 	public function classify($text) | ||||||
|  | 	{ | ||||||
|  |  | ||||||
|  | 		# Validate the startup | ||||||
|  |  | ||||||
|  | 		$started_up = $this->validate(); | ||||||
|  |  | ||||||
|  | 		if($started_up !== TRUE) | ||||||
|  | 			return $started_up; | ||||||
|  |  | ||||||
|  | 		# Get the internal database variables, containing the number of ham and | ||||||
|  | 		# spam texts so the spam probability can be calculated in relation to them | ||||||
|  | 		$internals = $this->_database->get_internals(); | ||||||
|  |  | ||||||
|  | 		# Calculate the spamminess of all tokens | ||||||
|  |  | ||||||
|  | 		# Get all tokens we want to rate | ||||||
|  |  | ||||||
|  | 		$tokens = $this->_lexer->get_tokens($text); | ||||||
|  |  | ||||||
|  | 		# Check if the lexer failed | ||||||
|  | 		# (if so, $tokens will be a lexer error code, if not, $tokens will be an array) | ||||||
|  | 		if(!is_array($tokens)) | ||||||
|  | 			return $tokens; | ||||||
|  |  | ||||||
|  | 		# Fetch all available data for the token set from the database | ||||||
|  | 		$this->_token_data = $this->_database->get(array_keys($tokens)); | ||||||
|  |  | ||||||
|  | 		# Calculate the spamminess and importance for each token (or a degenerated form of it) | ||||||
|  |  | ||||||
|  | 		$word_count = array(); | ||||||
|  | 		$rating     = array(); | ||||||
|  | 		$importance = array(); | ||||||
|  |  | ||||||
|  | 		foreach($tokens as $word => $count) { | ||||||
|  |  | ||||||
|  | 			$word_count[$word] = $count; | ||||||
|  |  | ||||||
|  | 			# Although this function is only called here, the calculation lives | ||||||
|  | 			# in its own function to make this a bit less confusing ;-) | ||||||
|  | 			$rating[$word] = $this->_get_probability($word, $internals['texts_ham'], $internals['texts_spam']); | ||||||
|  |  | ||||||
|  | 			$importance[$word] = abs(0.5 - $rating[$word]); | ||||||
|  |  | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 		# Order by importance | ||||||
|  | 		arsort($importance); | ||||||
|  | 		reset($importance); | ||||||
|  |  | ||||||
|  | 		# Get the most interesting tokens (use all if we have less than the given number) | ||||||
|  |  | ||||||
|  | 		$relevant = array(); | ||||||
|  |  | ||||||
|  | 		for($i = 0; $i < $this->config['use_relevant']; $i++) { | ||||||
|  |  | ||||||
|  | 			if($tmp = each($importance)) { | ||||||
|  |  | ||||||
|  | 				# Important tokens remain | ||||||
|  |  | ||||||
|  | 				# If the token's rating is relevant enough, use it | ||||||
|  |  | ||||||
|  | 				if(abs(0.5 - $rating[$tmp['key']]) > $this->config['min_dev']) { | ||||||
|  |  | ||||||
|  | 					# Tokens that appear more than once also count more than once | ||||||
|  |  | ||||||
|  | 					for($x = 0, $l = $word_count[$tmp['key']]; $x < $l; $x++) | ||||||
|  | 						array_push($relevant, $rating[$tmp['key']]); | ||||||
|  |  | ||||||
|  | 				} | ||||||
|  |  | ||||||
|  | 			} | ||||||
|  |  | ||||||
|  | 			else { | ||||||
|  | 				# We have fewer than use_relevant tokens, so we have already | ||||||
|  | 				# used everything there is and can break here | ||||||
|  | 				break; | ||||||
|  | 			} | ||||||
|  |  | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 		# Calculate the spamminess of the text (thanks to Mr. Robinson ;-) | ||||||
|  | 		# We set both hamminess and Spamminess to 1 for the first multiplying | ||||||
|  | 		$hamminess  = 1; | ||||||
|  | 		$spamminess = 1; | ||||||
|  |  | ||||||
|  | 		# Consider all relevant ratings | ||||||
|  | 		foreach($relevant as $value) { | ||||||
|  | 			$hamminess  *= (1.0 - $value); | ||||||
|  | 			$spamminess *= $value; | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 		# If no token was good for calculation, we really don't know how | ||||||
|  | 		# to rate this text; so we assume a spam and ham probability of 0.5 | ||||||
|  |  | ||||||
|  | 		if($hamminess === 1 and $spamminess === 1) { | ||||||
|  | 			$hamminess = 0.5; | ||||||
|  | 			$spamminess = 0.5; | ||||||
|  | 			$n = 1; | ||||||
|  | 		} | ||||||
|  | 		else { | ||||||
|  | 			# Get the number of relevant ratings | ||||||
|  | 			$n = count($relevant); | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 		# Calculate the combined rating | ||||||
|  |  | ||||||
|  | 		# The actual hamminess and spamminess | ||||||
|  | 		$hamminess  = 1 - pow($hamminess,  (1 / $n)); | ||||||
|  | 		$spamminess = 1 - pow($spamminess, (1 / $n)); | ||||||
|  |  | ||||||
|  | 		# Calculate the combined indicator | ||||||
|  | 		$probability = ($hamminess - $spamminess) / ($hamminess + $spamminess); | ||||||
|  |  | ||||||
|  | 		# We want a value between 0 and 1, not between -1 and +1, so ... | ||||||
|  | 		$probability = (1 + $probability) / 2; | ||||||
|  |  | ||||||
|  | 		# Alea iacta est | ||||||
|  | 		return $probability; | ||||||
|  |  | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	/** | ||||||
|  | 	 * Calculate the spamminess of a single token also considering "degenerated" versions | ||||||
|  | 	 * | ||||||
|  | 	 * @access private | ||||||
|  | 	 * @param string $word | ||||||
|  | 	 * @param string $texts_ham | ||||||
|  | 	 * @param string $texts_spam | ||||||
|  | 	 * @return float The token's rating (the rob_x default if the token is completely unknown) | ||||||
|  | 	 */ | ||||||
|  |  | ||||||
|  | 	private function _get_probability($word, $texts_ham, $texts_spam) | ||||||
|  | 	{ | ||||||
|  |  | ||||||
|  | 		# Let's see what we have! | ||||||
|  |  | ||||||
|  | 		if(isset($this->_token_data['tokens'][$word]) === TRUE) { | ||||||
|  | 			# The token was in the database, so we can use its data as-is | ||||||
|  | 			# and calculate the spamminess of this token directly | ||||||
|  | 			return $this->_calc_probability($this->_token_data['tokens'][$word], $texts_ham, $texts_spam); | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 		# Damn. The token was not found, so do we have at least similar words? | ||||||
|  |  | ||||||
|  | 		if(isset($this->_token_data['degenerates'][$word]) === TRUE) { | ||||||
|  |  | ||||||
|  | 			# We found similar words, so calculate the spamminess for each one | ||||||
|  | 			# and choose the most important one for the further calculation | ||||||
|  |  | ||||||
|  | 			# The default rating is 0.5 simply saying nothing | ||||||
|  | 			$rating = 0.5; | ||||||
|  |  | ||||||
|  | 			foreach($this->_token_data['degenerates'][$word] as $degenerate => $count) { | ||||||
|  |  | ||||||
|  | 				# Calculate the rating of the current degenerated token | ||||||
|  | 				$rating_tmp = $this->_calc_probability($count, $texts_ham, $texts_spam); | ||||||
|  |  | ||||||
|  | 				# Is it more important than the rating of another degenerated version? | ||||||
|  | 				if(abs(0.5 - $rating_tmp) > abs(0.5 - $rating)) | ||||||
|  | 					$rating = $rating_tmp; | ||||||
|  |  | ||||||
|  | 			} | ||||||
|  |  | ||||||
|  | 			return $rating; | ||||||
|  |  | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 		else { | ||||||
|  | 			# The token is really unknown, so choose the default rating | ||||||
|  | 			# for completely unknown tokens. This strips down to the | ||||||
|  | 			# robX parameter so we can cheap out the freaky math ;-) | ||||||
|  | 			return $this->config['rob_x']; | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	/** | ||||||
|  | 	 * Do the actual spamminess calculation of a single token | ||||||
|  | 	 * | ||||||
|  | 	 * @access private | ||||||
|  | 	 * @param array $data | ||||||
|  | 	 * @param string $texts_ham | ||||||
|  | 	 * @param string $texts_spam | ||||||
|  | 	 * @return float The token's spam probability, smoothed with rob_s and rob_x | ||||||
|  | 	 */ | ||||||
|  |  | ||||||
|  | 	private function _calc_probability($data, $texts_ham, $texts_spam) | ||||||
|  | 	{ | ||||||
|  |  | ||||||
|  | 		# Calculate the basic probability by Mr. Graham | ||||||
|  |  | ||||||
|  | 		# But: consider the number of ham and spam texts saved instead of the | ||||||
|  | 		# number of entries where the token appeared to calculate a relative | ||||||
|  | 		# spamminess because we count tokens appearing multiple times not just | ||||||
|  | 		# once but as often as they appear in the learned texts | ||||||
|  |  | ||||||
|  | 		$rel_ham = $data['count_ham']; | ||||||
|  | 		$rel_spam = $data['count_spam']; | ||||||
|  |  | ||||||
|  | 		if($texts_ham > 0) | ||||||
|  | 			$rel_ham = $data['count_ham'] / $texts_ham; | ||||||
|  |  | ||||||
|  | 		if($texts_spam > 0) | ||||||
|  | 			$rel_spam = $data['count_spam'] / $texts_spam; | ||||||
|  |  | ||||||
|  | 		$rating = $rel_spam / ($rel_ham + $rel_spam); | ||||||
|  |  | ||||||
|  | 		# Calculate the better probability proposed by Mr. Robinson | ||||||
|  | 		$all = $data['count_ham'] + $data['count_spam']; | ||||||
|  | 		return (($this->config['rob_s'] * $this->config['rob_x']) + ($all * $rating)) / ($this->config['rob_s'] + $all); | ||||||
|  |  | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	/** | ||||||
|  | 	 * Check the validity of the category of a request | ||||||
|  | 	 * | ||||||
|  | 	 * @access private | ||||||
|  | 	 * @param string $category | ||||||
|  | 	 * @return boolean TRUE if $category is b8::HAM or b8::SPAM, otherwise FALSE | ||||||
|  | 	 */ | ||||||
|  |  | ||||||
|  | 	private function _check_category($category) | ||||||
|  | 	{ | ||||||
|  | 		return $category === self::HAM or $category === self::SPAM; | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	/** | ||||||
|  | 	 * Learn a reference text | ||||||
|  | 	 * | ||||||
|  | 	 * @access public | ||||||
|  | 	 * @param string $text | ||||||
|  | 	 * @param const $category Either b8::SPAM or b8::HAM | ||||||
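|  | 	 * @return mixed An error code if the startup check, the category check or the lexer fails, otherwise the storage backend's result | ||||||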
|  | 	 */ | ||||||
|  |  | ||||||
|  | 	public function learn($text, $category) | ||||||
|  | 	{ | ||||||
|  | 		return $this->_process_text($text, $category, self::LEARN); | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	/** | ||||||
|  | 	 * Unlearn a reference text | ||||||
|  | 	 * | ||||||
|  | 	 * @access public | ||||||
|  | 	 * @param string $text | ||||||
|  | 	 * @param const $category Either b8::SPAM or b8::HAM | ||||||
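|  | 	 * @return mixed An error code if the startup check, the category check or the lexer fails, otherwise the storage backend's result | ||||||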
|  | 	 */ | ||||||
|  |  | ||||||
|  | 	public function unlearn($text, $category) | ||||||
|  | 	{ | ||||||
|  | 		return $this->_process_text($text, $category, self::UNLEARN); | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	/** | ||||||
|  | 	 * Does the actual interaction with the storage backend for learning or unlearning texts | ||||||
|  | 	 * | ||||||
|  | 	 * @access private | ||||||
|  | 	 * @param string $text | ||||||
|  | 	 * @param const $category Either b8::SPAM or b8::HAM | ||||||
|  | 	 * @param const $action Either b8::LEARN or b8::UNLEARN | ||||||
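|  | 	 * @return mixed An error code if the startup check, the category check or the lexer fails, otherwise the storage backend's result | ||||||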
|  | 	 */ | ||||||
|  |  | ||||||
|  | 	private function _process_text($text, $category, $action) | ||||||
|  | 	{ | ||||||
|  |  | ||||||
|  | 		# Validate the startup | ||||||
|  |  | ||||||
|  | 		$started_up = $this->validate(); | ||||||
|  |  | ||||||
|  | 		if($started_up !== TRUE) | ||||||
|  | 			return $started_up; | ||||||
|  |  | ||||||
|  | 		# Look if the request is okay | ||||||
|  | 		if($this->_check_category($category) === FALSE) | ||||||
|  | 			return self::TRAINER_CATEGORY_FAIL; | ||||||
|  |  | ||||||
|  | 		# Get all tokens from $text | ||||||
|  |  | ||||||
|  | 		$tokens = $this->_lexer->get_tokens($text); | ||||||
|  |  | ||||||
|  | 		# Check if the lexer failed | ||||||
|  | 		# (if so, $tokens will be a lexer error code, if not, $tokens will be an array) | ||||||
|  | 		if(!is_array($tokens)) | ||||||
|  | 			return $tokens; | ||||||
|  |  | ||||||
|  | 		# Pass the tokens and what to do with it to the storage backend | ||||||
|  | 		return $this->_database->process_text($tokens, $category, $action); | ||||||
|  |  | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | } | ||||||
|  |  | ||||||
|  | ?> | ||||||
| @@ -71,10 +71,10 @@ abstract class b8_storage_base | |||||||
| 	 * @return mixed Returns TRUE if everything is okay, otherwise an error code. | 	 * @return mixed Returns TRUE if everything is okay, otherwise an error code. | ||||||
| 	 */ | 	 */ | ||||||
|  |  | ||||||
| 	protected function check_database() | 	protected function check_database($uid) | ||||||
| 	{ | 	{ | ||||||
|  |  | ||||||
| 		$internals = $this->get_internals(); | 		$internals = $this->get_internals($uid); | ||||||
|  |  | ||||||
| 		if(isset($internals['dbversion'])) { | 		if(isset($internals['dbversion'])) { | ||||||
| 			if($internals['dbversion'] == "2") { | 			if($internals['dbversion'] == "2") { | ||||||
| @@ -122,7 +122,7 @@ abstract class b8_storage_base | |||||||
| 	 * @return array Returns an array of all internals. | 	 * @return array Returns an array of all internals. | ||||||
| 	 */ | 	 */ | ||||||
|  |  | ||||||
| 	public function get_internals() | 	public function get_internals($uid) | ||||||
| 	{ | 	{ | ||||||
|  |  | ||||||
| 		$internals = $this->_get_query( | 		$internals = $this->_get_query( | ||||||
| @@ -130,7 +130,8 @@ abstract class b8_storage_base | |||||||
| 				self::INTERNALS_TEXTS_HAM, | 				self::INTERNALS_TEXTS_HAM, | ||||||
| 				self::INTERNALS_TEXTS_SPAM, | 				self::INTERNALS_TEXTS_SPAM, | ||||||
| 				self::INTERNALS_DBVERSION | 				self::INTERNALS_DBVERSION | ||||||
| 			) | 			), | ||||||
|  | 			$uid | ||||||
| 		); | 		); | ||||||
|  |  | ||||||
| 		return array( | 		return array( | ||||||
| @@ -149,7 +150,7 @@ abstract class b8_storage_base | |||||||
| 	 * @return mixed Returns FALSE on failure, otherwise returns array of returned data in the format array('tokens' => array(token => count), 'degenerates' => array(token => array(degenerate => count))). | 	 * @return mixed Returns FALSE on failure, otherwise returns array of returned data in the format array('tokens' => array(token => count), 'degenerates' => array(token => array(degenerate => count))). | ||||||
| 	 */ | 	 */ | ||||||
|  |  | ||||||
| 	public function get($tokens) | 	public function get($tokens, $uid) | ||||||
| 	{ | 	{ | ||||||
|  |  | ||||||
| 		# Validate the startup | 		# Validate the startup | ||||||
| @@ -160,7 +161,7 @@ abstract class b8_storage_base | |||||||
| 			return $started_up; | 			return $started_up; | ||||||
|  |  | ||||||
| 		# First we see what we have in the database. | 		# First we see what we have in the database. | ||||||
| 		$token_data = $this->_get_query($tokens); | 		$token_data = $this->_get_query($tokens, $uid); | ||||||
|  |  | ||||||
| 		# Check if we have to degenerate some tokens | 		# Check if we have to degenerate some tokens | ||||||
|  |  | ||||||
| @@ -203,7 +204,7 @@ abstract class b8_storage_base | |||||||
| 				$return_data_tokens[$token] = $this->_parse_count($token_data[$token]); | 				$return_data_tokens[$token] = $this->_parse_count($token_data[$token]); | ||||||
|  |  | ||||||
| 				# ... and update its lastseen parameter | 				# ... and update its lastseen parameter | ||||||
| 				$this->_update($token, "{$return_data_tokens[$token]['count_ham']} {$return_data_tokens[$token]['count_spam']} " . $this->b8_config['today']); | 				$this->_update($token, "{$return_data_tokens[$token]['count_ham']} {$return_data_tokens[$token]['count_spam']} " . $this->b8_config['today'], $uid ); | ||||||
|  |  | ||||||
| 			} | 			} | ||||||
|  |  | ||||||
| @@ -224,7 +225,7 @@ abstract class b8_storage_base | |||||||
| 						$return_data_degenerates[$token][$degenerate] = $this->_parse_count($token_data[$degenerate]); | 						$return_data_degenerates[$token][$degenerate] = $this->_parse_count($token_data[$degenerate]); | ||||||
|  |  | ||||||
| 						# ... and update its lastseen parameter | 						# ... and update its lastseen parameter | ||||||
| 						$this->_update($degenerate, "{$return_data_degenerates[$token][$degenerate]['count_ham']} {$return_data_degenerates[$token][$degenerate]['count_spam']} " . $this->b8_config['today']); | 						$this->_update($degenerate, "{$return_data_degenerates[$token][$degenerate]['count_ham']} {$return_data_degenerates[$token][$degenerate]['count_spam']} " . $this->b8_config['today'], $uid); | ||||||
|  |  | ||||||
| 					} | 					} | ||||||
|  |  | ||||||
| @@ -258,7 +259,7 @@ abstract class b8_storage_base | |||||||
| 	 * @return void | 	 * @return void | ||||||
| 	 */ | 	 */ | ||||||
|  |  | ||||||
| 	public function process_text($tokens, $category, $action) | 	public function process_text($tokens, $category, $action, $uid) | ||||||
| 	{ | 	{ | ||||||
|  |  | ||||||
| 		# Validate the startup | 		# Validate the startup | ||||||
| @@ -271,10 +272,10 @@ abstract class b8_storage_base | |||||||
| 		# No matter what we do, we first have to check what data we have. | 		# No matter what we do, we first have to check what data we have. | ||||||
|  |  | ||||||
| 		# First get the internals, including the ham texts and spam texts counter | 		# First get the internals, including the ham texts and spam texts counter | ||||||
| 		$internals = $this->get_internals(); | 		$internals = $this->get_internals($uid); | ||||||
|  |  | ||||||
| 		# Then, fetch all data for all tokens we have (and update their lastseen parameters) | 		# Then, fetch all data for all tokens we have (and update their lastseen parameters) | ||||||
| 		$token_data = $this->_get_query(array_keys($tokens)); | 		$token_data = $this->_get_query(array_keys($tokens), $uid); | ||||||
|  |  | ||||||
| 		# Process all tokens to learn/unlearn | 		# Process all tokens to learn/unlearn | ||||||
|  |  | ||||||
| @@ -315,7 +316,7 @@ abstract class b8_storage_base | |||||||
|  |  | ||||||
| 				# Now let's see if we have to update or delete the token | 				# Now let's see if we have to update or delete the token | ||||||
| 				if($count_ham !== 0 or $count_spam !== 0) | 				if($count_ham !== 0 or $count_spam !== 0) | ||||||
| 					$this->_update($token, "$count_ham $count_spam " . $this->b8_config['today']); | 					$this->_update($token, "$count_ham $count_spam " . $this->b8_config['today'], $uid); | ||||||
| 				else | 				else | ||||||
| 					$this->_del($token); | 					$this->_del($token); | ||||||
|  |  | ||||||
| @@ -335,7 +336,7 @@ abstract class b8_storage_base | |||||||
|  |  | ||||||
| 					$data .= $this->b8_config['today']; | 					$data .= $this->b8_config['today']; | ||||||
|  |  | ||||||
| 					$this->_put($token, $data); | 					$this->_put($token, $data, $uid); | ||||||
|  |  | ||||||
| 				} | 				} | ||||||
|  |  | ||||||
| @@ -349,12 +350,12 @@ abstract class b8_storage_base | |||||||
|  |  | ||||||
| 			if($category === b8::HAM) { | 			if($category === b8::HAM) { | ||||||
| 				$internals['texts_ham']++; | 				$internals['texts_ham']++; | ||||||
| 				$this->_update(self::INTERNALS_TEXTS_HAM, $internals['texts_ham']); | 				$this->_update(self::INTERNALS_TEXTS_HAM, $internals['texts_ham'], $uid); | ||||||
| 			} | 			} | ||||||
|  |  | ||||||
| 			elseif($category === b8::SPAM) { | 			elseif($category === b8::SPAM) { | ||||||
| 				$internals['texts_spam']++; | 				$internals['texts_spam']++; | ||||||
| 				$this->_update(self::INTERNALS_TEXTS_SPAM, $internals['texts_spam']); | 				$this->_update(self::INTERNALS_TEXTS_SPAM, $internals['texts_spam'], $uid); | ||||||
| 			} | 			} | ||||||
|  |  | ||||||
| 		} | 		} | ||||||
| @@ -368,7 +369,7 @@ abstract class b8_storage_base | |||||||
| 				if($internals['texts_ham'] < 0) | 				if($internals['texts_ham'] < 0) | ||||||
| 					$internals['texts_ham'] = 0; | 					$internals['texts_ham'] = 0; | ||||||
|  |  | ||||||
| 				$this->_update(self::INTERNALS_TEXTS_HAM, $internals['texts_ham']); | 				$this->_update(self::INTERNALS_TEXTS_HAM, $internals['texts_ham'], $uid); | ||||||
|  |  | ||||||
| 			} | 			} | ||||||
|  |  | ||||||
| @@ -379,7 +380,7 @@ abstract class b8_storage_base | |||||||
| 				if($internals['texts_spam'] < 0) | 				if($internals['texts_spam'] < 0) | ||||||
| 					$internals['texts_spam'] = 0; | 					$internals['texts_spam'] = 0; | ||||||
|  |  | ||||||
| 				$this->_update(self::INTERNALS_TEXTS_SPAM, $internals['texts_spam']); | 				$this->_update(self::INTERNALS_TEXTS_SPAM, $internals['texts_spam'], $uid); | ||||||
|  |  | ||||||
| 			} | 			} | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										395
									
								
								library/spam/b8/storage/storage_base.php.ORIG
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										395
									
								
								library/spam/b8/storage/storage_base.php.ORIG
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,395 @@ | |||||||
|  | <?php | ||||||
|  |  | ||||||
|  | #   Copyright (C) 2010 Tobias Leupold <tobias.leupold@web.de> | ||||||
|  | # | ||||||
|  | #   This file is part of the b8 package | ||||||
|  | # | ||||||
|  | #   This program is free software; you can redistribute it and/or modify it | ||||||
|  | #   under the terms of the GNU Lesser General Public License as published by | ||||||
|  | #   the Free Software Foundation in version 2.1 of the License. | ||||||
|  | # | ||||||
|  | #   This program is distributed in the hope that it will be useful, but | ||||||
|  | #   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | ||||||
|  | #   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public | ||||||
|  | #   License for more details. | ||||||
|  | # | ||||||
|  | #   You should have received a copy of the GNU Lesser General Public License | ||||||
|  | #   along with this program; if not, write to the Free Software Foundation, | ||||||
|  | #   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. | ||||||
|  |  | ||||||
|  | /** | ||||||
|  |  * Functions used by all storage backends | ||||||
|  |  * Copyright (C) 2010 Tobias Leupold <tobias.leupold@web.de> | ||||||
|  |  * | ||||||
|  |  * @license LGPL | ||||||
|  |  * @access public | ||||||
|  |  * @package b8 | ||||||
|  |  * @author Tobias Leupold | ||||||
|  |  */ | ||||||
|  |  | ||||||
abstract class b8_storage_base
{

	# TRUE once the concrete backend has established its connection
	public $connected            = FALSE;

	# Degenerator instance, (re)created by validate()
	protected $_degenerator      = NULL;

	# Names of the special "internal" tokens kept in the word list
	const INTERNALS_TEXTS_HAM    = 'bayes*texts.ham';
	const INTERNALS_TEXTS_SPAM   = 'bayes*texts.spam';
	const INTERNALS_DBVERSION    = 'bayes*dbversion';

	# Error codes returned instead of TRUE
	const BACKEND_NOT_CONNECTED  = 'BACKEND_NOT_CONNECTED';
	const DATABASE_WRONG_VERSION = 'DATABASE_WRONG_VERSION';
	const DATABASE_NOT_B8        = 'DATABASE_NOT_B8';

	/**
	 * Validates the class has all it needs to work.
	 *
	 * As a side effect, a fresh degenerator object is created on every call.
	 *
	 * @access protected
	 * @return mixed Returns TRUE if everything is okay, otherwise an error code.
	 */

	protected function validate()
	{

		# We set up the degenerator here, as we would have to duplicate code if it
		# was done in the constructor of the respective storage backend.
		$class = 'b8_degenerator_' . $this->b8_config['degenerator'];
		$this->_degenerator = new $class();

		if($this->connected !== TRUE)
			return self::BACKEND_NOT_CONNECTED;

		return TRUE;

	}

	/**
	 * Checks if a b8 database is used and if its version is okay.
	 *
	 * On failure, $this->connected is reset to FALSE.
	 *
	 * @access protected
	 * @return mixed Returns TRUE if everything is okay, otherwise an error code.
	 */

	protected function check_database()
	{

		$internals = $this->get_internals();

		# get_internals() reports a missing version entry as NULL, so isset()
		# tells "no b8 database at all" apart from "wrong version"
		if(!isset($internals['dbversion'])) {
			$this->connected = FALSE;
			return self::DATABASE_NOT_B8;
		}

		if($internals['dbversion'] == "2")
			return TRUE;

		$this->connected = FALSE;
		return self::DATABASE_WRONG_VERSION;

	}

	/**
	 * Parses the "count" data of a token.
	 *
	 * The stored format is "count_ham count_spam lastseen". The lastseen part
	 * is not returned. Missing parts are treated as 0 instead of raising notices.
	 *
	 * @access private
	 * @param string $data
	 * @return array Returns an array of the parsed data: array(count_ham, count_spam).
	 */

	private function _parse_count($data)
	{

		$parts = explode(' ', (string) $data);

		return array(
			'count_ham'  => isset($parts[0]) ? (int) $parts[0] : 0,
			'count_spam' => isset($parts[1]) ? (int) $parts[1] : 0
		);

	}

	/**
	 * Queues an update that rewrites a token's counters with today's lastseen value.
	 *
	 * @access private
	 * @param string $token
	 * @param array $counts array('count_ham' => int, 'count_spam' => int)
	 * @return void
	 */

	private function _touch_token($token, $counts)
	{
		$this->_update($token, "{$counts['count_ham']} {$counts['count_spam']} " . $this->b8_config['today']);
	}

	/**
	 * Get the database's internal variables.
	 *
	 * The text counters default to 0 when they are not stored. 'dbversion' is
	 * NULL when the database has no version entry (i.e. it is not a b8 database).
	 *
	 * @access public
	 * @return array Returns an array of all internals (texts_ham, texts_spam, dbversion).
	 */

	public function get_internals()
	{

		$stored = $this->_get_query(
			array(
				self::INTERNALS_TEXTS_HAM,
				self::INTERNALS_TEXTS_SPAM,
				self::INTERNALS_DBVERSION
			)
		);

		return array(
			'texts_ham'  => isset($stored[self::INTERNALS_TEXTS_HAM])  ? (int) $stored[self::INTERNALS_TEXTS_HAM]  : 0,
			'texts_spam' => isset($stored[self::INTERNALS_TEXTS_SPAM]) ? (int) $stored[self::INTERNALS_TEXTS_SPAM] : 0,
			'dbversion'  => isset($stored[self::INTERNALS_DBVERSION])  ? (int) $stored[self::INTERNALS_DBVERSION]  : NULL
		);

	}

	/**
	 * Get all data about a list of tags from the database.
	 *
	 * Tokens that are not stored are degenerated and their degenerated forms
	 * are looked up instead. The lastseen value of everything found is updated.
	 *
	 * @access public
	 * @param array $tokens
	 * @return mixed Returns an error code if the backend is not ready, otherwise returns array of returned data in the format array('tokens' => array(token => count), 'degenerates' => array(token => array(degenerate => count))).
	 */

	public function get($tokens)
	{

		# Validate the startup

		$started_up = $this->validate();

		if($started_up !== TRUE)
			return $started_up;

		# First we see what we have in the database.
		$token_data = $this->_get_query($tokens);

		# Check if we have to degenerate some tokens

		$missing_tokens = array();

		foreach($tokens as $token) {
			if(!isset($token_data[$token]))
				$missing_tokens[] = $token;
		}

		if(count($missing_tokens) > 0) {

			# Generate a list of degenerated tokens for the missing tokens ...
			$degenerates_list = array();

			foreach($this->_degenerator->degenerate($missing_tokens) as $token_degenerates)
				$degenerates_list = array_merge($degenerates_list, $token_degenerates);

			# ... and look them up. The array union is used instead of array_merge(),
			# because array_merge() renumbers integer keys and would thereby lose the
			# data of purely numeric tokens. Freshly fetched data goes first so it wins.
			$token_data = $this->_get_query($degenerates_list) + $token_data;

		}

		# Here, we have all available data in $token_data.

		$return_data_tokens = array();
		$return_data_degenerates = array();

		foreach($tokens as $token) {

			if(isset($token_data[$token]) === TRUE) {

				# The token was found in the database: add the data
				# and update its lastseen parameter
				$return_data_tokens[$token] = $this->_parse_count($token_data[$token]);
				$this->_touch_token($token, $return_data_tokens[$token]);

				continue;

			}

			# The token was not found, so we look if we can return data for
			# degenerated tokens (if the degenerator produced any for it)

			if(!isset($this->_degenerator->degenerates[$token]))
				continue;

			foreach($this->_degenerator->degenerates[$token] as $degenerate) {

				if(isset($token_data[$degenerate]) !== TRUE)
					continue;

				# A degeneration of the token was found in the database: add the
				# data and update its lastseen parameter
				$return_data_degenerates[$token][$degenerate] = $this->_parse_count($token_data[$degenerate]);
				$this->_touch_token($degenerate, $return_data_degenerates[$token][$degenerate]);

			}

		}

		# Now, all token data directly found in the database is in $return_data_tokens
		# and all data for degenerated versions is in $return_data_degenerates

		# First, we commit the changes to the lastseen parameters
		$this->_commit();

		# Then, we return what we have
		return array(
			'tokens'      => $return_data_tokens,
			'degenerates' => $return_data_degenerates
		);

	}

	/**
	 * Stores or deletes a list of tokens from the given category.
	 *
	 * @access public
	 * @param array $tokens array(token => number of occurrences in the text)
	 * @param const $category Either b8::HAM or b8::SPAM
	 * @param const $action Either b8::LEARN or b8::UNLEARN
	 * @return mixed Nothing on success, an error code if the backend is not ready.
	 */

	public function process_text($tokens, $category, $action)
	{

		# Validate the startup

		$started_up = $this->validate();

		if($started_up !== TRUE)
			return $started_up;

		# No matter what we do, we first have to check what data we have.

		# First get the internals, including the ham texts and spam texts counter
		$internals = $this->get_internals();

		# Then, fetch all data for all tokens we have
		$token_data = $this->_get_query(array_keys($tokens));

		# Process all tokens to learn/unlearn

		foreach($tokens as $token => $count) {

			if(isset($token_data[$token])) {

				# We already have this token, so update its data

				$counts     = $this->_parse_count($token_data[$token]);
				$count_ham  = $counts['count_ham'];
				$count_spam = $counts['count_spam'];

				# Increase or decrease the right counter

				if($action === b8::LEARN) {
					if($category === b8::HAM)
						$count_ham += $count;
					elseif($category === b8::SPAM)
						$count_spam += $count;
				}

				elseif($action == b8::UNLEARN) {
					if($category === b8::HAM)
						$count_ham -= $count;
					elseif($category === b8::SPAM)
						$count_spam -= $count;
				}

				# We don't want to have negative values

				if($count_ham < 0)
					$count_ham = 0;

				if($count_spam < 0)
					$count_spam = 0;

				# Now let's see if we have to update or delete the token
				if($count_ham !== 0 or $count_spam !== 0)
					$this->_update($token, "$count_ham $count_spam " . $this->b8_config['today']);
				else
					$this->_del($token);

			}

			elseif($action === b8::LEARN) {

				# We don't have the token. If we unlearn a text, we can't delete it
				# as we don't have it anyway, so just do something if we learn a text.
				# The token is stored with its real number of occurrences, so that
				# new tokens are counted the same way as already known ones.

				$occurrences = (int) $count;

				if($category === b8::HAM)
					$this->_put($token, "$occurrences 0 " . $this->b8_config['today']);
				elseif($category === b8::SPAM)
					$this->_put($token, "0 $occurrences " . $this->b8_config['today']);

				# Any other category is ignored instead of storing malformed data

			}

		}

		# Now, all tokens have been processed, so let's update the right text counter

		if($action === b8::LEARN) {

			if($category === b8::HAM) {
				$internals['texts_ham']++;
				$this->_update(self::INTERNALS_TEXTS_HAM, $internals['texts_ham']);
			}

			elseif($category === b8::SPAM) {
				$internals['texts_spam']++;
				$this->_update(self::INTERNALS_TEXTS_SPAM, $internals['texts_spam']);
			}

		}

		elseif($action == b8::UNLEARN) {

			if($category === b8::HAM) {

				$internals['texts_ham']--;

				if($internals['texts_ham'] < 0)
					$internals['texts_ham'] = 0;

				$this->_update(self::INTERNALS_TEXTS_HAM, $internals['texts_ham']);

			}

			elseif($category === b8::SPAM) {

				$internals['texts_spam']--;

				if($internals['texts_spam'] < 0)
					$internals['texts_spam'] = 0;

				$this->_update(self::INTERNALS_TEXTS_SPAM, $internals['texts_spam']);

			}

		}

		# We're done and can commit all changes to the database now
		$this->_commit();

	}

}
|  |  | ||||||
|  | ?> | ||||||
							
								
								
									
										351
									
								
								library/spam/b8/storage/storage_frndc.php
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										351
									
								
								library/spam/b8/storage/storage_frndc.php
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,351 @@ | |||||||
|  | <?php | ||||||
|  |  | ||||||
|  | #   Copyright (C) 2006-2011 Tobias Leupold <tobias.leupold@web.de> | ||||||
|  | # | ||||||
|  | #   This file is part of the b8 package | ||||||
|  | # | ||||||
|  | #   This program is free software; you can redistribute it and/or modify it | ||||||
|  | #   under the terms of the GNU Lesser General Public License as published by | ||||||
|  | #   the Free Software Foundation in version 2.1 of the License. | ||||||
|  | # | ||||||
|  | #   This program is distributed in the hope that it will be useful, but | ||||||
|  | #   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | ||||||
|  | #   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public | ||||||
|  | #   License for more details. | ||||||
|  | # | ||||||
|  | #   You should have received a copy of the GNU Lesser General Public License | ||||||
|  | #   along with this program; if not, write to the Free Software Foundation, | ||||||
|  | #   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. | ||||||
|  |  | ||||||
|  | /** | ||||||
|  |  * The MySQL abstraction layer for communicating with the database. | ||||||
|  |  * Copyright (C) 2009 Oliver Lillie (aka buggedcom) | ||||||
|  |  * Copyright (C) 2010-2011 Tobias Leupold <tobias.leupold@web.de> | ||||||
|  |  * | ||||||
|  |  * @license LGPL | ||||||
|  |  * @access public | ||||||
|  |  * @package b8 | ||||||
|  |  * @author Oliver Lillie (aka buggedcom) (original PHP 5 port and optimizations) | ||||||
|  |  * @author Tobias Leupold | ||||||
|  |  */ | ||||||
|  |  | ||||||
# NOTE(review): this backend is built on the legacy ext/mysql API (mysql_connect() etc.),
# which is not available in PHP 7 and later. Porting it to mysqli/PDO would change the
# type accepted for the 'connection' config item, so it is deliberately not done here.

class b8_storage_mysql extends b8_storage_base
{

	# Connection settings. 'connection' may hold an already opened MySQL link
	# resource; NULL means b8 opens (and later closes) its own connection and
	# FALSE marks an invalid value that was passed in.
	public $config = array(
		'database'        => 'b8_wordlist',
		'table_name'      => 'b8_wordlist',
		'host'            => 'localhost',
		'user'            => FALSE,
		'pass'            => FALSE,
		'connection'      => NULL
	);

	# Settings handed over from the main b8 class
	public $b8_config = array(
		'degenerator'     => NULL,
		'today'           => NULL
	);

	# The MySQL link in use and the modifications queued until _commit()
	private $_connection                   = NULL;
	private $_deletes                      = array();
	private $_puts                         = array();
	private $_updates                      = array();

	const DATABASE_CONNECTION_FAIL         = 'DATABASE_CONNECTION_FAIL';
	const DATABASE_CONNECTION_ERROR        = 'DATABASE_CONNECTION_ERROR';
	const DATABASE_CONNECTION_BAD_RESOURCE = 'DATABASE_CONNECTION_BAD_RESOURCE';
	const DATABASE_SELECT_ERROR            = 'DATABASE_SELECT_ERROR';
	const DATABASE_TABLE_ACCESS_FAIL       = 'DATABASE_TABLE_ACCESS_FAIL';
	const DATABASE_WRONG_VERSION           = 'DATABASE_WRONG_VERSION';

	/**
	 * Constructs the database layer.
	 *
	 * @access public
	 * @param array $config Backend settings, see $this->config for the known keys; unknown keys are ignored
	 * @param string $degenerator Name of the degenerator to use
	 * @param string $today Value stored as "lastseen" of touched tokens
	 */

	public function __construct($config, $degenerator, $today)
	{

		# Pass some variables of the main b8 config to this class
		$this->b8_config['degenerator'] = $degenerator;
		$this->b8_config['today']       = $today;

		# Validate the config items (anything but an array is ignored)

		if(!is_array($config))
			return;

		foreach($config as $name => $value) {

			switch($name) {

				case 'table_name':
				case 'host':
				case 'user':
				case 'pass':
				case 'database':
					$this->config[$name] = (string) $value;
					break;

				case 'connection':

					# NULL keeps the default: b8 will create its own connection
					if($value === NULL)
						break;

					# Everything else has to be a MySQL link resource
					$is_mysql_link = FALSE;

					if(is_resource($value) === TRUE) {
						$resource_type = get_resource_type($value);
						$is_mysql_link = ($resource_type === 'mysql link' || $resource_type === 'mysql link persistent');
					}

					$this->config['connection'] = $is_mysql_link ? $value : FALSE;

					break;

			}

		}

	}

	/**
	 * Commits pending changes and closes the database connection.
	 *
	 * @access public
	 * @return void
	 */

	public function __destruct()
	{

		if($this->_connection === NULL)
			return;

		# Commit any changes before closing
		$this->_commit();

		# Just close the connection if no link-resource was passed and b8 created its own connection
		if($this->config['connection'] === NULL)
			mysql_close($this->_connection);

		$this->connected = FALSE;

	}

	/**
	 * Connect to the database and do some checks.
	 *
	 * @access public
	 * @return mixed Returns TRUE on a successful database connection, otherwise returns an error code (possibly followed by the MySQL error message).
	 */

	public function connect()
	{

		# Are we already connected?
		if($this->connected === TRUE)
			return TRUE;

		# Are we using an existing passed resource?
		if($this->config['connection'] === FALSE) {
			# ... yes we are, but the connection is not a resource, so return an error
			$this->connected = FALSE;
			return self::DATABASE_CONNECTION_BAD_RESOURCE;
		}

		if($this->config['connection'] === NULL) {

			# ... no we aren't so we have to connect.

			$this->_connection = mysql_connect($this->config['host'], $this->config['user'], $this->config['pass']);

			if(!$this->_connection) {
				$this->connected = FALSE;
				return self::DATABASE_CONNECTION_ERROR;
			}

			if(mysql_select_db($this->config['database'], $this->_connection) === FALSE) {
				$this->connected = FALSE;
				return self::DATABASE_SELECT_ERROR . ": " . mysql_error($this->_connection);
			}

		}

		else {
			# ... yes we are
			$this->_connection = $this->config['connection'];
		}

		# Just in case ...
		if($this->_connection === NULL) {
			$this->connected = FALSE;
			return self::DATABASE_CONNECTION_FAIL;
		}

		# Check to see if the wordlist table exists
		if(mysql_query('DESCRIBE ' . $this->config['table_name'], $this->_connection) === FALSE) {
			$this->connected = FALSE;
			return self::DATABASE_TABLE_ACCESS_FAIL . ": " . mysql_error($this->_connection);
		}

		# Everything is okay and connected
		$this->connected = TRUE;

		# Let's see if this is a b8 database and the version is okay
		return $this->check_database();

	}

	/**
	 * Does the actual interaction with the database when fetching data.
	 *
	 * @access protected
	 * @param array $tokens A list of tokens (a single token string is accepted as well)
	 * @return array Returns an array of the returned data in the format array(token => data) or an empty array if there was no data.
	 */

	protected function _get_query($tokens)
	{

		# Accept a single token as well as a list of tokens
		if(!is_array($tokens))
			$tokens = ($tokens === NULL) ? array() : array($tokens);

		# Nothing to look up, so there is no need to ask the database
		if(count($tokens) === 0)
			return array();

		# Construct the query ...

		$escaped = array();

		foreach($tokens as $token)
			$escaped[] = mysql_real_escape_string($token, $this->_connection);

		# ... and fetch the data

		$result = mysql_query('
			SELECT token, count
			FROM ' . $this->config['table_name'] . '
			WHERE token IN ("' . implode('", "', $escaped) . '");
		', $this->_connection);

		$data = array();

		# A failed query yields no data (and no result that could be read or freed)
		if($result === FALSE)
			return $data;

		while($row = mysql_fetch_array($result, MYSQL_ASSOC))
			$data[$row['token']] = $row['count'];

		mysql_free_result($result);

		return $data;

	}

	/**
	 * Store a token to the database (queued until _commit() is called).
	 *
	 * @access protected
	 * @param string $token
	 * @param string $count
	 * @return void
	 */

	protected function _put($token, $count)
	{
		$token = mysql_real_escape_string($token, $this->_connection);
		$count = mysql_real_escape_string($count, $this->_connection);
		array_push($this->_puts, '("' . $token . '", "' . $count . '")');
	}

	/**
	 * Update an existing token (queued until _commit() is called).
	 *
	 * @access protected
	 * @param string $token
	 * @param string $count
	 * @return void
	 */

	protected function _update($token, $count)
	{
		$token = mysql_real_escape_string($token, $this->_connection);
		$count = mysql_real_escape_string($count, $this->_connection);
		array_push($this->_updates, '("' . $token . '", "' . $count . '")');
	}

	/**
	 * Remove a token from the database (queued until _commit() is called).
	 *
	 * @access protected
	 * @param string $token
	 * @return void
	 */

	protected function _del($token)
	{
		$token = mysql_real_escape_string($token, $this->_connection);
		array_push($this->_deletes, $token);
	}

	/**
	 * Commits any modification queries.
	 *
	 * Runs the queued deletes, inserts and updates (in this order) and empties
	 * the queues. The write queries return no result set, so nothing is freed.
	 *
	 * @access protected
	 * @return void
	 */

	protected function _commit()
	{

		if(count($this->_deletes) > 0) {

			mysql_query('
				DELETE FROM ' . $this->config['table_name'] . '
				WHERE token IN ("' . implode('", "', $this->_deletes) . '");
			', $this->_connection);

			$this->_deletes = array();

		}

		if(count($this->_puts) > 0) {

			mysql_query('
				INSERT INTO ' . $this->config['table_name'] . '(token, count)
				VALUES ' . implode(', ', $this->_puts) . ';', $this->_connection);

			$this->_puts = array();

		}

		if(count($this->_updates) > 0) {

			# Updates are written as "insert or overwrite" so they can be batched
			mysql_query('
				INSERT INTO ' . $this->config['table_name'] . '(token, count)
				VALUES ' . implode(', ', $this->_updates) . '
				ON DUPLICATE KEY UPDATE ' . $this->config['table_name'] . '.count = VALUES(count);', $this->_connection);

			$this->_updates = array();

		}

	}

}
|  |  | ||||||
|  | ?> | ||||||
		Reference in New Issue
	
	Block a user