<?php

/*
// Simple META Management Suite for Joomla 1.5.x - Version 1.5.4 [Dragonbane]
//
// Licence:
//		AGPLv3 - http://www.aqsg.com.au/products/simple-meta-management-suite/licence.html
//
// Author:
//		Joel Bassett - http://www.aqsg.com.au
//
// Credits:
//		http://www.aqsg.com.au/products/simple-meta-management-suite/credits.html
*/

// No direct access: stop with 'Restricted access' unless the _JEXEC constant is already defined.
defined( '_JEXEC' ) or die( 'Restricted access' );

/**
 * Projectname .......... Multibyte Keyword Generator
 * Version .............. 0.9
 * Last modified ........ 2009-11-05
 * Author(s) ............ Peter Kahl, www.dezzignz.com <bf91da40_AT_gmail.com>
 *                        Ver Pangonilo <smp_AT_itsp.info>
 * Copyright (c) ........ 2009 Peter Kahl
 *                        2006 Ver Pangonilo
 *                        All Rights Reserved
 * GNU General Public License (Version 2, June 1991)
 */

/**
 * Generates a comma-separated META keyword list from a block of (HTML) text.
 *
 * The text is cleaned and split into segments once, at construction time.
 * get_keywords() then counts single words, 2-word phrases and 3-word phrases
 * and keeps those that occur often enough.
 */
class AutoKeywordGenerator {

	// segments of cleaned text (array of strings), or false when there is nothing to analyse
	var $contents;
	// multibyte encoding name, upper-cased (default 'UTF-8')
	var $encoding;
	// language code, lower-cased (default 'en')
	var $lang;
	// array of language codes to ignore, or false when none were given
	var $ignore;

	// reserved for the generated keywords; no method of this class assigns it
	var $keywords;

	// minimum length of a word for inclusion into the single word metakeys (0 means disable)
	var $wordLengthMin;
	// minimum number of occurrences of a single word
	var $wordOccuredMin;

	// minimum length of each word of a 2-word phrase (false means disable)
	var $word2WordPhraseLengthMin;
	// minimum length of a whole 2-word phrase
	var $phrase2WordLengthMin;
	// minimum number of occurrences of a 2-word phrase
	var $phrase2WordLengthMinOccur;

	// minimum length of each word of a 3-word phrase (false means disable)
	var $word3WordPhraseLengthMin;
	// minimum length of a whole 3-word phrase
	var $phrase3WordLengthMin;
	// minimum number of occurrences of a 3-word phrase
	var $phrase3WordLengthMinOccur;

	// comma-separated list of keywords that must never be returned ('' means none)
	var $blacklist;

	/**
	 * Constructor.
	 *
	 * Recognised keys of $params (all optional):
	 *   lang, encoding, ignore (array of language codes), content,
	 *   min_word_length, min_word_occur,
	 *   min_2words_length, min_2words_phrase_length, min_2words_phrase_occur,
	 *   min_3words_length, min_3words_phrase_length, min_3words_phrase_occur,
	 *   blacklist (comma-separated string).
	 *
	 * Side effect: sets the mbstring internal encoding to the chosen encoding.
	 *
	 * @param array $params settings as listed above
	 */
	function __construct ($params) {

		// language or default language, if not defined (case insensitive)
		$this->lang = isset($params['lang']) ? strtolower($params['lang']) : 'en';

		// multibyte internal encoding (case insensitive)
		$this->encoding = isset($params['encoding']) ? strtoupper($params['encoding']) : 'UTF-8';
		mb_internal_encoding($this->encoding);

		// languages to ignore
		if (isset($params['ignore']) && is_array($params['ignore'])) $this->ignore = $params['ignore'];
		else $this->ignore = false;

		// clean up input string; break along punctuations; explode into array
		if ($this->ignore !== false && in_array($this->lang, $this->ignore)) {
			$this->contents = false; // language to be ignored
		} else {
			$this->contents = $this->process_text(isset($params['content']) ? $params['content'] : '');
		}

		// single keyword; cast so that the string '0' also disables (parameters often arrive as strings)
		$this->wordLengthMin  = isset($params['min_word_length']) ? (int) $params['min_word_length'] : 5;
		$this->wordOccuredMin = isset($params['min_word_occur']) ? (int) $params['min_word_occur'] : 3;

		// 2-word phrases; value 0 means disable
		if (isset($params['min_2words_length'])) {
			$wordMin = (int) $params['min_2words_length'];
			if ($wordMin === 0) {
				$this->word2WordPhraseLengthMin  = false;
			} else {
				$this->word2WordPhraseLengthMin  = $wordMin;
				$this->phrase2WordLengthMin      = isset($params['min_2words_phrase_length']) ? (int) $params['min_2words_phrase_length'] : 8;
				$this->phrase2WordLengthMinOccur = isset($params['min_2words_phrase_occur']) ? (int) $params['min_2words_phrase_occur'] : 3;
			}
		} else {
			// if not set, use these defaults
			$this->word2WordPhraseLengthMin  = 4;
			$this->phrase2WordLengthMin      = 8;
			$this->phrase2WordLengthMinOccur = 3;
		}

		// 3-word phrases; value 0 means disable
		if (isset($params['min_3words_length'])) {
			$wordMin = (int) $params['min_3words_length'];
			if ($wordMin === 0) {
				$this->word3WordPhraseLengthMin  = false;
			} else {
				$this->word3WordPhraseLengthMin  = $wordMin;
				$this->phrase3WordLengthMin      = isset($params['min_3words_phrase_length']) ? (int) $params['min_3words_phrase_length'] : 12;
				$this->phrase3WordLengthMinOccur = isset($params['min_3words_phrase_occur']) ? (int) $params['min_3words_phrase_occur'] : 3;
			}
		} else {
			// if not set, use these defaults
			$this->word3WordPhraseLengthMin  = 4;
			$this->phrase3WordLengthMin      = 12;
			$this->phrase3WordLengthMinOccur = 3;
		}

		$this->blacklist = isset($params['blacklist']) ? $params['blacklist'] : '';
	}

	/**
	 * Old-style constructor name, kept so that existing explicit calls and
	 * engines that only know class-named constructors keep working.
	 *
	 * @param array $params see __construct()
	 */
	function AutoKeywordGenerator ($params) {
		$this->__construct($params);
	}

	/**
	 * Builds the keyword list: single words first, then 2-word phrases, then
	 * 3-word phrases. Empty entries and blacklisted keywords are left out.
	 *
	 * @return string comma-separated keywords, '' when there is nothing to report
	 */
	function get_keywords () {
		if ($this->contents === false) return '';

		$onew_arr = $this->parse_words();
		$twow_arr = $this->parse_2words();
		$thrw_arr = $this->parse_3words();

		// remove phrases that equal consecutive single keywords joined by a space
		if ($onew_arr !== false && $twow_arr !== false) {
			$twow_arr = array_diff($twow_arr, $this->consecutive_keyword_runs($onew_arr, 2));
		}
		if ($onew_arr !== false && $thrw_arr !== false) {
			$thrw_arr = array_diff($thrw_arr, $this->consecutive_keyword_runs($onew_arr, 3));
		}

		// remove duplicate ENGLISH plural words
		if ($this->lang == 'en' && $onew_arr !== false) {
			$onew_arr = $this->remove_english_plurals($onew_arr);
		}

		// collect all groups into one flat list, skipping empty entries
		$keywords = array();
		foreach (array($onew_arr, $twow_arr, $thrw_arr) as $group) {
			if ($group === false) continue;
			foreach ($group as $keyword) {
				$keyword = (string) $keyword;
				if ($keyword !== '') $keywords[] = $keyword;
			}
		}

		// drop blacklisted keywords; entries are trimmed so that 'a, b' works like 'a,b'
		if ($this->blacklist != '') {
			$blocked  = array_map('trim', explode(',', $this->blacklist));
			$keywords = array_diff($keywords, $blocked);
		}

		return implode(',', $keywords);
	}

	/**
	 * Turns raw (HTML) text into an array of lower-cased text segments.
	 *
	 * Strips markup, removes 1-2 letter words, digits, possessive 's, hyphens
	 * and the stop words stored in #__smms_stopwords, then splits the result
	 * along line breaks and sentence punctuation.
	 *
	 * @param string $str raw content
	 * @return array|false list of segments, or false when no text is left
	 */
	function process_text ($str) {
		if (preg_match('/^\s*$/', $str)) return false;

		// strip HTML
		$str = $this->html2txt($str);

		// convert all characters to lower case
		$str = mb_strtolower($str, $this->encoding);

		// some cleanup
		$str = ' '. $str .' '; // pad that is necessary
		$str = preg_replace('#\ [a-z]{1,2}\ #i', ' ', $str); // remove 1 and 2 letter words
		// NOTE(review): this strips every comma, dot and colon, not only those inside
		// numerals, so the splits on '.' and ',' below never find anything - confirm intent.
		$str = preg_replace('#[0-9\,\.:]#', '', $str);
		$str = preg_replace("/([a-z]{2,})'s/", '\\1', $str); // remove only the 's (as in mother's)
		$str = str_replace('-', ' ', $str); // remove hyphens (-)

		// remove the stop words kept in the database
		$db =& JFactory::getDBO();
		$query	= 'SELECT DISTINCT word FROM #__smms_stopwords ORDER BY word';
		$db->setQuery($query);
		$common = $db->loadObjectList();

		if (is_array($common)) {
			foreach ($common as $word) {
				$str = str_replace(' ' . $word->word . ' ', ' ', $str);
			}
		}
		unset($common);

		// replace multiple whitespaces
		$str = preg_replace('/\s\s+/', ' ', $str);
		$str = trim($str);

		if (preg_match('/^\s*$/', $str)) return false;

		// break along line breaks and punctuation; empty segments are kept
		return preg_split('/[\n.!?,;]/', $str);
	}

	/**
	 * Single keywords: words that are long enough and occur often enough.
	 *
	 * @return array|false list of words, or false when disabled or none qualify
	 */
	function parse_words () {
		if ((int) $this->wordLengthMin === 0) return false; // 0 means disable
		if (!is_array($this->contents)) return false;

		$str = $this->stripPunctuations(implode(' ', $this->contents));

		$k = array();
		foreach (explode(' ', $str) as $val) {
			if (mb_strlen($val, $this->encoding) >= $this->wordLengthMin) $k[] = $val;
		}

		if (count($k) == 0) return false;

		// count the words; this is the real magic!
		return $this->occure_filter(array_count_values($k), $this->wordOccuredMin);
	}

	/**
	 * 2-word phrases that satisfy the configured length and occurrence limits.
	 *
	 * @return array|false list of phrases, or false when disabled or none qualify
	 */
	function parse_2words () {
		if (!$this->word2WordPhraseLengthMin) return false; // disabled
		return $this->parse_phrases(2, $this->word2WordPhraseLengthMin, $this->phrase2WordLengthMin, $this->phrase2WordLengthMinOccur);
	}

	/**
	 * 3-word phrases that satisfy the configured length and occurrence limits.
	 *
	 * @return array|false list of phrases, or false when disabled or none qualify
	 */
	function parse_3words () {
		if (!$this->word3WordPhraseLengthMin) return false; // disabled
		return $this->parse_phrases(3, $this->word3WordPhraseLengthMin, $this->phrase3WordLengthMin, $this->phrase3WordLengthMinOccur);
	}

	/**
	 * Shared worker of parse_2words() and parse_3words().
	 *
	 * Phrases are built from neighbouring words of the same segment only, so
	 * they never cross a segment boundary.
	 *
	 * @param int $size            number of words per phrase
	 * @param int $wordLengthMin   minimum length of every word of the phrase
	 * @param int $phraseLengthMin minimum length of the whole phrase (empty value: no limit)
	 * @param int $occurMin        minimum number of occurrences
	 * @return array|false list of phrases, or false when none qualify
	 */
	function parse_phrases ($size, $wordLengthMin, $phraseLengthMin, $occurMin) {
		if (!is_array($this->contents)) return false;

		$phrases = array();
		foreach ($this->contents as $segment) {
			$words = explode(' ', $this->stripPunctuations($segment));
			$last  = count($words) - $size;
			for ($i = 0; $i <= $last; $i++) {
				$window = array_slice($words, $i, $size);

				$longEnough = true;
				foreach ($window as $word) {
					if (mb_strlen($word, $this->encoding) < $wordLengthMin) {
						$longEnough = false;
						break;
					}
				}
				if (!$longEnough) continue;

				$phrase = implode(' ', $window);
				if ($phraseLengthMin && mb_strlen($phrase, $this->encoding) < $phraseLengthMin) continue;

				$phrases[] = $phrase;
			}
		}

		if (count($phrases) == 0) return false;

		// count the phrases; this is the real magic!
		return $this->occure_filter(array_count_values($phrases), $occurMin);
	}

	/**
	 * Keeps the entries of a "value => count" map whose count reaches $min.
	 *
	 * @param array $array map of word or phrase => number of occurrences
	 * @param int   $min   minimum number of occurrences
	 * @return array|false list of the kept words (0-based), or false when none are kept
	 */
	function occure_filter ($array, $min) {
		$kept = array();
		foreach ($array as $word => $occured) {
			if ($occured >= $min) $kept[] = $word;
		}
		if (count($kept) == 0) return false;
		return $kept;
	}

	/**
	 * Flattens a nested array into a one-dimensional, 0-based list.
	 *
	 * @param mixed $array value to flatten
	 * @return mixed flattened list; a non-array input is returned untouched
	 */
	function array_one_dimension ($array) {
		if (!is_array($array)) return $array; // because the input was not array

		$flat = array();
		foreach ($array as $item) {
			if (is_array($item)) {
				foreach ($this->array_one_dimension($item) as $leaf) $flat[] = $leaf;
			} else {
				$flat[] = $item;
			}
		}
		return $flat;
	}

	/**
	 * Joins every run of $size neighbouring entries of $words with single spaces.
	 *
	 * @param array $words list of single keywords
	 * @param int   $size  number of neighbouring entries per run
	 * @return array list of joined runs; empty when $words has fewer than $size entries
	 */
	function consecutive_keyword_runs ($words, $size) {
		$words = array_values($words);
		$runs  = array();
		$last  = count($words) - $size;
		for ($i = 0; $i <= $last; $i++) {
			$runs[] = implode(' ', array_slice($words, $i, $size));
		}
		return $runs;
	}

	/**
	 * Removes every entry that is another entry of the same list plus a
	 * trailing 's' (e.g. 'cats' when 'cat' is present). The check is made
	 * against the list as passed in, so the result does not depend on order.
	 *
	 * @param array $words list of keywords
	 * @return array 0-based list without the plural duplicates
	 */
	function remove_english_plurals ($words) {
		$list = array();
		foreach ($words as $word) $list[] = (string) $word;

		$known = array_flip($list);
		$kept  = array();
		foreach ($list as $word) {
			$len = mb_strlen($word, $this->encoding);
			if ($len > 0 && mb_substr($word, $len - 1, 1, $this->encoding) === 's') {
				$singular = mb_substr($word, 0, $len - 1, $this->encoding);
				if (isset($known[$singular])) continue; // singular form is present
			}
			$kept[] = $word;
		}
		return $kept;
	}

	/**
	 * Converts HTML to plain text; line breaks are used to separate segments.
	 *
	 * @param string $str HTML source
	 * @return string trimmed plain text
	 */
	function html2txt ($str) {
		if ($str == '') return '';
		$str = preg_replace("#(<br>|<br\s?/>){1,}#i", " \n", $str); // replace <br> with \n
		$str = preg_replace("#<head>(.*?)</head>#s", " ", $str);
		$str = preg_replace("#<script(.*?)</script>#s", " ", $str);
		$str = preg_replace("#<style(.*?)</style>#s", " ", $str);
		$str = preg_replace("#<link(.*?)/>#s", " ", $str);
		$str = preg_replace("#</p>(\n)*<p>#i", " \n", $str); // we use \n to segment words
		$str = preg_replace("#(\n){2,}#", " \n", $str); // replace multiple with single line breaks
		$str = $this->strip_html_tags($str);
		$str = strip_tags($str);
		$unwanted = array('"', '“', '„', '<', '>', '/', '*', '[', ']', '+', '=', '#');
		$str = str_replace($unwanted, ' ', $str);
		$str = preg_replace('/&nbsp;/i', ' ', $str); // remove &nbsp;
		$str = preg_replace('/&[a-z]{2,5};/i', '', $str); // remove &trade;  &copy;
		$str = preg_replace('/\s\s+/', ' ', $str); // replace multiple white spaces
		return trim($str);
	}

	/**
	 * Replaces punctuation characters with spaces and collapses runs of
	 * whitespace. The result is not trimmed.
	 *
	 * @param string $str text
	 * @return string text without punctuation
	 */
	function stripPunctuations ($str) {
		if ($str == '') return '';
		// edit as needed
		$punctuations = array('"', "'", '’', '˝', '„', '`', '.', ',', ';', ':', '+', '±', '-', '_', '=', '(', ')', '[', ']', '<', '>', '{', '}', '/', '\\', '|', '?', '!', '@', '#', '%', '^', '&', '§', '$', '¢', '£', '€', '¥', '*', '~', '。','，','、','；','：','？','！','…','—','·','ˉ','ˇ','¨','‘','’','“','”','々','～','‖','∶','＂','＇','｀','｜','〃','〔','〕','〈','〉','《','》','「','」','『','』','．','〖','〗','【','】','（','）','［','］','｛','｝');
		$str = str_replace($punctuations, ' ', $str);
		return preg_replace('/\s\s+/', ' ', $str);
	}

	/**
	 * Removes invisible content (head, style, script, ...), puts a line break
	 * in front of block-level tags and then strips all remaining tags.
	 *
	 * @param string $str HTML source
	 * @return string text without tags
	 */
	function strip_html_tags($str) {
		$text = preg_replace(
			array(
				// Remove invisible content
				'@<head[^>]*?>.*?</head>@siu',
				'@<style[^>]*?>.*?</style>@siu',
				'@<script[^>]*?.*?</script>@siu',
				'@<object[^>]*?.*?</object>@siu',
				'@<embed[^>]*?.*?</embed>@siu',
				'@<applet[^>]*?.*?</applet>@siu',
				'@<noframes[^>]*?.*?</noframes>@siu',
				'@<noscript[^>]*?.*?</noscript>@siu',
				'@<noembed[^>]*?.*?</noembed>@siu',

				// Add line breaks before & after blocks
				'@<((br)|(hr))@iu',
				'@</?((address)|(blockquote)|(center)|(del))@iu',
				'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
				'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
				'@</?((table)|(th)|(td)|(caption))@iu',
				'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
				'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
				'@</?((frameset)|(frame)|(iframe))@iu',
			),
			array(
				' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
				"\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0",
				"\n\$0", "\n\$0",
			),
			$str );

		// preg_replace() yields NULL on failure (the patterns use the u modifier,
		// which rejects input that is not valid UTF-8); fall back to the raw input then
		if ($text === null) $text = $str;

		// Remove all remaining tags and comments and return.
		return strip_tags( $text );
	}

	/**
	 * Normalises a comma-separated keyword string: lower-cases and trims the
	 * entries, drops empty and repeated ones and, for English, drops plural
	 * duplicates.
	 *
	 * @param string $str comma-separated keywords
	 * @return string cleaned comma-separated keywords
	 */
	function removeDuplicateKw ($str) {
		if ($str == '') return $str;

		$kw_arr = array();
		foreach (explode(',', trim(mb_strtolower($str, $this->encoding))) as $val) {
			$val = trim($val);
			if ($val !== '') $kw_arr[] = $val;
		}
		$kw_arr = array_values(array_unique($kw_arr));

		// remove duplicate ENGLISH plural words
		if ($this->lang == 'en') {
			$kw_arr = $this->remove_english_plurals($kw_arr);
		}

		// job is done!
		return implode(',', $kw_arr);
	}

}


?>