350 lines
		
	
	
		
			9.2 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			350 lines
		
	
	
		
			9.2 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
| <?php
 | |
| 
 | |
| /**
 | |
|  * This class represents a text sample to be parsed.
 | |
|  *
 | |
|  * @category    Text
 | |
|  * @package     Text_LanguageDetect
 | |
|  * @author      Nicholas Pisarro
 | |
|  * @copyright   2006
 | |
|  * @license     BSD
 | |
|  * @version     CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
 | |
|  * @link        http://pear.php.net/package/Text_LanguageDetect/
 | |
|  * @link        http://langdetect.blogspot.com/
 | |
|  */
 | |
| 
 | |
| /**
 | |
|  * This class represents a text sample to be parsed.
 | |
|  *
 | |
|  * This separates the analysis of a text sample from the primary LanguageDetect
 | |
|  * class. After a new profile has been built, the data can be retrieved using
 | |
|  * the accessor functions.
 | |
|  *
 | |
|  * This class is intended to be used by the Text_LanguageDetect class, not 
 | |
|  * end-users.
 | |
|  *
 | |
|  * @category    Text
 | |
|  * @package     Text_LanguageDetect
 | |
|  * @author      Nicholas Pisarro
 | |
|  * @copyright   2006
 | |
|  * @license     BSD
 | |
|  * @version     release: 0.3.0
 | |
|  */
 | |
| class Text_LanguageDetect_Parser extends Text_LanguageDetect
 | |
| {
 | |
|     /**
 | |
|      * the piece of text being parsed
 | |
|      *
 | |
|      * @access  private
 | |
|      * @var     string
 | |
|      */
 | |
|     var $_string;
 | |
| 
 | |
|     /**
 | |
|      * stores the trigram frequencies of the sample
 | |
|      *
 | |
|      * @access  private
 | |
|      * @var     string
 | |
|      */
 | |
|     var $_trigrams = array();
 | |
| 
 | |
|     /**
 | |
|      * stores the trigram ranks of the sample
 | |
|      *
 | |
|      * @access  private
 | |
|      * @var     array
 | |
|      */
 | |
|     var $_trigram_ranks = array();
 | |
| 
 | |
|     /**
 | |
|      * stores the unicode blocks of the sample
 | |
|      *
 | |
|      * @access  private
 | |
|      * @var     array
 | |
|      */
 | |
|     var $_unicode_blocks = array();
 | |
|     
 | |
|     /**
 | |
|      * Whether the parser should compile the unicode ranges
 | |
|      * 
 | |
|      * @access  private
 | |
|      * @var     bool
 | |
|      */
 | |
|     var $_compile_unicode = false;
 | |
| 
 | |
|     /**
 | |
|      * Whether the parser should compile trigrams
 | |
|      *
 | |
|      * @access  private
 | |
|      * @var     bool
 | |
|      */
 | |
|     var $_compile_trigram = false;
 | |
| 
 | |
|     /**
 | |
|      * Whether the trigram parser should pad the beginning of the string
 | |
|      *
 | |
|      * @access  private
 | |
|      * @var     bool
 | |
|      */
 | |
|     var $_trigram_pad_start = false;
 | |
| 
 | |
|     /**
 | |
|      * Whether the unicode parser should skip non-alphabetical ascii chars
 | |
|      *
 | |
|      * @access  private
 | |
|      * @var     bool
 | |
|      */
 | |
|     var $_unicode_skip_symbols = true;
 | |
| 
 | |
|     /**
 | |
|      * Constructor
 | |
|      *
 | |
|      * @access  private
 | |
|      * @param   string  $string     string to be parsed
 | |
|      */
 | |
|     function Text_LanguageDetect_Parser($string) {
 | |
|         $this->_string = $string;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Returns true if a string is suitable for parsing
 | |
|      *
 | |
|      * @param   string  $str    input string to test
 | |
|      * @return  bool            true if acceptable, false if not
 | |
|      */
 | |
|     public static function validateString($str) {
 | |
|         if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
 | |
|             return true;
 | |
|         } else {
 | |
|             return false;
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * turn on/off trigram counting
 | |
|      *
 | |
|      * @access  public
 | |
|      * @param   bool    $bool true for on, false for off
 | |
|      */
 | |
|     function prepareTrigram($bool = true)
 | |
|     {
 | |
|         $this->_compile_trigram = $bool;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * turn on/off unicode block counting
 | |
|      *
 | |
|      * @access  public
 | |
|      * @param   bool    $bool true for on, false for off
 | |
|      */
 | |
|     function prepareUnicode($bool = true)
 | |
|     {
 | |
|         $this->_compile_unicode = $bool;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * turn on/off padding the beginning of the sample string
 | |
|      *
 | |
|      * @access  public
 | |
|      * @param   bool    $bool true for on, false for off
 | |
|      */
 | |
|     function setPadStart($bool = true)
 | |
|     {
 | |
|         $this->_trigram_pad_start = $bool;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Should the unicode block counter skip non-alphabetical ascii chars?
 | |
|      *
 | |
|      * @access  public
 | |
|      * @param   bool    $bool true for on, false for off
 | |
|      */
 | |
|     function setUnicodeSkipSymbols($bool = true)
 | |
|     {
 | |
|         $this->_unicode_skip_symbols = $bool;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Returns the trigram ranks for the text sample
 | |
|      *
 | |
|      * @access  public
 | |
|      * @return  array    trigram ranks in the text sample
 | |
|      */
 | |
|     function &getTrigramRanks()
 | |
|     {
 | |
|         return $this->_trigram_ranks;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Return the trigram freqency table
 | |
|      *
 | |
|      * only used in testing to make sure the parser is working
 | |
|      *
 | |
|      * @access  public
 | |
|      * @return  array    trigram freqencies in the text sample
 | |
|      */
 | |
|     function &getTrigramFreqs()
 | |
|     {
 | |
|         return $this->_trigram;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * returns the array of unicode blocks
 | |
|      *
 | |
|      * @access  public
 | |
|      * @return  array   unicode blocks in the text sample
 | |
|      */
 | |
|     function &getUnicodeBlocks()
 | |
|     {
 | |
|         return $this->_unicode_blocks;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Executes the parsing operation
 | |
|      * 
 | |
|      * Be sure to call the set*() functions to set options and the 
 | |
|      * prepare*() functions first to tell it what kind of data to compute
 | |
|      *
 | |
|      * Afterwards the get*() functions can be used to access the compiled
 | |
|      * information.
 | |
|      *
 | |
|      * @access public
 | |
|      */
 | |
|     function analyze()
 | |
|     {
 | |
|         $len = strlen($this->_string);
 | |
|         $byte_counter = 0;
 | |
| 
 | |
| 
 | |
|         // unicode startup
 | |
|         if ($this->_compile_unicode) {
 | |
|             $blocks = $this->_read_unicode_block_db();
 | |
|             $block_count = count($blocks);
 | |
| 
 | |
|             $skipped_count = 0;
 | |
|             $unicode_chars = array();
 | |
|         }
 | |
| 
 | |
|         // trigram startup
 | |
|         if ($this->_compile_trigram) {
 | |
|             // initialize them as blank so the parser will skip the first two
 | |
|             // (since it skips trigrams with more than  2 contiguous spaces)
 | |
|             $a = ' ';
 | |
|             $b = ' ';
 | |
| 
 | |
|             // kludge
 | |
|             // if it finds a valid trigram to start and the start pad option is
 | |
|             // off, then set a variable that will be used to reduce this
 | |
|             // trigram after parsing has finished
 | |
|             if (!$this->_trigram_pad_start) {
 | |
|                 $a = $this->_next_char($this->_string, $byte_counter, true);
 | |
| 
 | |
|                 if ($a != ' ') {
 | |
|                     $b = $this->_next_char($this->_string, $byte_counter, true);
 | |
|                     $dropone = " $a$b";
 | |
|                 }
 | |
| 
 | |
|                 $byte_counter = 0;
 | |
|                 $a = ' ';
 | |
|                 $b = ' ';
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         while ($byte_counter < $len) {
 | |
|             $char = $this->_next_char($this->_string, $byte_counter, true);
 | |
| 
 | |
| 
 | |
|             // language trigram detection
 | |
|             if ($this->_compile_trigram) {
 | |
|                 if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
 | |
|                     if (!isset($this->_trigram[$a . $b . $char])) {
 | |
|                        $this->_trigram[$a . $b . $char] = 1;
 | |
|                     } else {
 | |
|                        $this->_trigram[$a . $b . $char]++;
 | |
|                     }
 | |
|                 }
 | |
| 
 | |
|                 $a = $b;
 | |
|                 $b = $char;
 | |
|             }
 | |
| 
 | |
|             // unicode block detection
 | |
|             if ($this->_compile_unicode) {
 | |
|                 if ($this->_unicode_skip_symbols
 | |
|                         && strlen($char) == 1
 | |
|                         && ($char < 'A' || $char > 'z'
 | |
|                         || ($char > 'Z' && $char < 'a'))
 | |
|                         && $char != "'") {  // does not skip the apostrophe
 | |
|                                             // since it's included in the language
 | |
|                                             // models
 | |
| 
 | |
|                     $skipped_count++;
 | |
|                     continue;
 | |
|                 }
 | |
| 
 | |
|                 // build an array of all the characters
 | |
|                 if (isset($unicode_chars[$char])) {
 | |
|                     $unicode_chars[$char]++;
 | |
|                 } else {
 | |
|                     $unicode_chars[$char] = 1;
 | |
|                 }
 | |
|             }
 | |
| 
 | |
|             // todo: add byte detection here
 | |
|         }
 | |
| 
 | |
|         // unicode cleanup
 | |
|         if ($this->_compile_unicode) {
 | |
|             foreach ($unicode_chars as $utf8_char => $count) {
 | |
|                 $search_result = $this->_unicode_block_name(
 | |
|                         $this->_utf8char2unicode($utf8_char), $blocks, $block_count);
 | |
| 
 | |
|                 if ($search_result != -1) {
 | |
|                     $block_name = $search_result[2];
 | |
|                 } else {
 | |
|                     $block_name = '[Malformatted]';
 | |
|                 }
 | |
| 
 | |
|                 if (isset($this->_unicode_blocks[$block_name])) {
 | |
|                     $this->_unicode_blocks[$block_name] += $count;
 | |
|                 } else {
 | |
|                     $this->_unicode_blocks[$block_name] = $count;
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
| 
 | |
| 
 | |
|         // trigram cleanup
 | |
|         if ($this->_compile_trigram) {
 | |
|             // pad the end
 | |
|             if ($b != ' ') {
 | |
|                 if (!isset($this->_trigram["$a$b "])) {
 | |
|                     $this->_trigram["$a$b "] = 1;
 | |
|                 } else {
 | |
|                     $this->_trigram["$a$b "]++;
 | |
|                 }
 | |
|             }
 | |
| 
 | |
|             // perl compatibility; Language::Guess does not pad the beginning
 | |
|             // kludge
 | |
|             if (isset($dropone)) {
 | |
|                 if ($this->_trigram[$dropone] == 1) {
 | |
|                     unset($this->_trigram[$dropone]);
 | |
|                 } else {
 | |
|                     $this->_trigram[$dropone]--;
 | |
|                 }
 | |
|             }
 | |
| 
 | |
|             if (!empty($this->_trigram)) {
 | |
|                 $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
 | |
|             } else {
 | |
|                 $this->_trigram_ranks = array();
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
 | |
| 
 | |
| ?>
 |