From f669be9b2b84daf5f7689b9e2839a25536d470f6 Mon Sep 17 00:00:00 2001 From: onli Date: Sun, 9 Aug 2020 11:10:41 +0200 Subject: [PATCH] bayes 1.0: cleanup and code update version --- serendipity_event_spamblock_bayes/ChangeLog | 9 +- .../serendipity_event_spamblock_bayes.css | 195 -- serendipity_event_spamblock_bayes/b8/b8.php | 406 ++++ .../b8/degenerator/standard.php | 176 ++ .../b8/lexer/standard.php | 267 +++ .../b8/storage/dba.php | 105 + .../b8/storage/mysql.php | 110 + .../b8/storage/sqlite.php | 108 + .../b8/storage/storage_base.php | 316 +++ .../bayesAnalysis.tpl | 27 - .../bayesAnalysismenu.tpl | 73 - .../bayesDBmenu.tpl | 112 - .../bayesImportmenu.tpl | 31 - .../bayesLearnmenu.tpl | 86 - .../bayesNavigation.tpl | 32 - .../bayesRecyclermenu.tpl | 110 +- .../bayes_commentlist.js | 309 +-- .../details.polyfill.min.js | 1 - .../jquery.excerpt.js | 166 -- .../jquery.heatcolor.js | 7 - .../jquery.tablesorter.js | 2 - serendipity_event_spamblock_bayes/load.gif | Bin 673 -> 0 bytes .../publicTrojaKey.pem | 6 - .../serendipity_event_spamblock_bayes.css | 138 -- .../serendipity_event_spamblock_bayes.js | 72 - .../serendipity_event_spamblock_bayes.php | 2015 ++--------------- 26 files changed, 1730 insertions(+), 3149 deletions(-) delete mode 100644 serendipity_event_spamblock_bayes/admin/serendipity_event_spamblock_bayes.css create mode 100644 serendipity_event_spamblock_bayes/b8/b8.php create mode 100644 serendipity_event_spamblock_bayes/b8/degenerator/standard.php create mode 100644 serendipity_event_spamblock_bayes/b8/lexer/standard.php create mode 100644 serendipity_event_spamblock_bayes/b8/storage/dba.php create mode 100644 serendipity_event_spamblock_bayes/b8/storage/mysql.php create mode 100644 serendipity_event_spamblock_bayes/b8/storage/sqlite.php create mode 100644 serendipity_event_spamblock_bayes/b8/storage/storage_base.php delete mode 100644 serendipity_event_spamblock_bayes/bayesAnalysis.tpl delete mode 100644 
serendipity_event_spamblock_bayes/bayesAnalysismenu.tpl delete mode 100644 serendipity_event_spamblock_bayes/bayesDBmenu.tpl delete mode 100644 serendipity_event_spamblock_bayes/bayesImportmenu.tpl delete mode 100644 serendipity_event_spamblock_bayes/bayesLearnmenu.tpl delete mode 100644 serendipity_event_spamblock_bayes/bayesNavigation.tpl delete mode 100644 serendipity_event_spamblock_bayes/details.polyfill.min.js delete mode 100644 serendipity_event_spamblock_bayes/jquery.excerpt.js delete mode 100644 serendipity_event_spamblock_bayes/jquery.heatcolor.js delete mode 100644 serendipity_event_spamblock_bayes/jquery.tablesorter.js delete mode 100644 serendipity_event_spamblock_bayes/load.gif delete mode 100644 serendipity_event_spamblock_bayes/publicTrojaKey.pem delete mode 100644 serendipity_event_spamblock_bayes/serendipity_event_spamblock_bayes.css delete mode 100644 serendipity_event_spamblock_bayes/serendipity_event_spamblock_bayes.js diff --git a/serendipity_event_spamblock_bayes/ChangeLog b/serendipity_event_spamblock_bayes/ChangeLog index d77ab413..0bbee9d2 100644 --- a/serendipity_event_spamblock_bayes/ChangeLog +++ b/serendipity_event_spamblock_bayes/ChangeLog @@ -1,4 +1,11 @@ -0.5.6: fix possible SQL exposure in email field of comment +1.0: + * Update from the forked old b8 version to a current + version of that library + * Code cleanup + * Remove seldomly used functions and legacy code, including + most of the backend UI + * Rewrite javascript used in the backend (for marking spam + and ham comments) to be a minimal block of jQuery code 0.5.5: Translation fixes (German). 
diff --git a/serendipity_event_spamblock_bayes/admin/serendipity_event_spamblock_bayes.css b/serendipity_event_spamblock_bayes/admin/serendipity_event_spamblock_bayes.css deleted file mode 100644 index 19a50533..00000000 --- a/serendipity_event_spamblock_bayes/admin/serendipity_event_spamblock_bayes.css +++ /dev/null @@ -1,195 +0,0 @@ -#bayesNav { - margin: 0; - padding: 0; -} -#bayesNav a { - display: block; -} -#bayesNav li:last-child a { - padding-right: 0.3em; -} -#bayesNav ul { - list-style-type: none; - margin: 0; - padding: 0; - border: 1px solid; -} -#bayesNav li { - display: inline-block; - margin: 0; - padding: 0.5em; - border-right: 1px solid; -} -#bayesNav h3 { - display: inline; - margin: 0; - padding: 0; - font-size: 1em; -} -#bayesContent { - width: 100%; -} -#bayesControls * { - margin-left: 1em; - margin-right: 1em; - margin-bottom: 1em; -} - -#bayesLearnTable { - padding-top: 1em; - margin-bottom: 0.8em; -} -#bayesLearnTable td { - vertical-align: top; -} -#bayesControls { - float: right; - border: 1px solid; - border-top: 0; - -moz-border-radius-bottomleft: 5px; - -webkit-border-bottom-left-radius: 5px; - -moz-border-radius-bottomright: 5px; - -webkit-border-bottom-right-radius: 5px; - max-width: 21%; - padding-top: 1em; -} -#bayesControls form { - margin: 0; -} -#bayesDatabase { - padding-top: 2em; - margin-left: 2em; -} -#bayesSavedValues { - padding-top: 2em; -} -#bayesDatabaseTable th { - cursor: pointer; - text-decoration: underline; -} -th { - border: 1px solid; -} -caption { - font-weight: bold; -} -#bayesSavedValuesTable td { - text-align: center; -} -#bayesRecyclerTable { - padding-top: 2em; - width: 78%; - table-layout: fixed; - -} -#bayesRecyclerTable td { - padding-top: 1em; - overflow: auto; -} -#bayesRecyclerTable th.select { - text-align: center; - width: 2em; -} -#bayesRecyclerTable td.select { - text-align: center; -} -.ratingBox { - border-bottom: 1px solid grey; -} -.commentPart { - margin-left: 5em; -} -.rating { - 
margin-left: 5em; - font-weight: bold; -} -.commentType { - font-weight: bold; -} -.finalRating { - padding-left: 3.3em; - font-weight: bold; - font-size: 1.5em; - border-bottom: 1px solid grey; -} -#bayesAnalysisList li { - margin: 1em; -} -label { - cursor: pointer; -} -.serendipityIconLinkRight { - left: 90%; - position: relative; -} -#bayesControls label, #bayesControls input { - display: block; -} -input[type="submit"] { - cursor: pointer; -} -.bayesTrojaButtons { - display: inline; -} -fieldset { - display: inline-block; -} -#trojaImport { - margin-left: 1.1em; -} -#bayesDatabaseTablePagination { - font-size: 0.8em; - margin-top: 0.2em; -} -a.curpage { - border: 1px dotted black !important; -} -#bayesDatabaseTablePagination a { - border: 1px solid black; - padding: 1px; -} -#bayesRecycler, #bayesAnalysis { - display: inline-block; -} -#bayesRecycler { - max-width: 80%; -} -.bayesRecyclerSummary { - display: inline; - vertical-align: middle; -} -input { - vertical-align: middle; -} -.bayesRecyclerSummary td{ - text-align: center; -} -.bayesRecyclerItem { - padding-top: 1em; -} -.bayesRecyclerList { - margin-left: 3em; -} -.bayesRecyclerList dt { - font-weight: bold; -} -.bayesRecyclerList dt:after { - content: ":"; -} -.bayesRecyclerList dt+dd { - margin-left: 2em; -} -.bayesRecyclerTableNavigation { - margin-top: 1em; -} -.bayesAnalysisTableNavigation { - margin-top: 1em; -} -#bayesAnalysisButton { - margin-left: 1em; -} -summary { - cursor: pointer; -} - - diff --git a/serendipity_event_spamblock_bayes/b8/b8.php b/serendipity_event_spamblock_bayes/b8/b8.php new file mode 100644 index 00000000..43c0e1f6 --- /dev/null +++ b/serendipity_event_spamblock_bayes/b8/b8.php @@ -0,0 +1,406 @@ + + + b8 - A statistical ("Bayesian") spam filter written in PHP + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation in version 2.1 of the License. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. +*/ + +/** + * The b8 spam filter library + * + * @license LGPL 2.1 + * @package b8 + * @author Tobias Leupold + * @author Oliver Lillie (original PHP 5 port) + */ + +namespace b8; + +spl_autoload_register( + function ($class) { + $parts = explode('\\', $class); + if ($parts[1]) { + require_once __DIR__ . DIRECTORY_SEPARATOR . $parts[1] . DIRECTORY_SEPARATOR . $parts[2] . '.php'; + } + } +); + +class b8 +{ + const DBVERSION = 3; + + const SPAM = 'spam'; + const HAM = 'ham'; + const LEARN = 'learn'; + const UNLEARN = 'unlearn'; + + const CLASSIFIER_TEXT_MISSING = 'CLASSIFIER_TEXT_MISSING'; + + const TRAINER_TEXT_MISSING = 'TRAINER_TEXT_MISSING'; + const TRAINER_CATEGORY_MISSING = 'TRAINER_CATEGORY_MISSING'; + const TRAINER_CATEGORY_FAIL = 'TRAINER_CATEGORY_FAIL'; + + const INTERNALS_TEXTS = 'b8*texts'; + const INTERNALS_DBVERSION = 'b8*dbversion'; + + const KEY_DB_VERSION = 'dbversion'; + const KEY_COUNT_HAM = 'count_ham'; + const KEY_COUNT_SPAM = 'count_spam'; + const KEY_TEXTS_HAM = 'texts_ham'; + const KEY_TEXTS_SPAM = 'texts_spam'; + + private $config = [ 'lexer' => 'standard', + 'degenerator' => 'standard', + 'storage' => 'dba', + 'use_relevant' => 15, + 'min_dev' => 0.2, + 'rob_s' => 0.3, + 'rob_x' => 0.5 ]; + + private $storage = null; + private $lexer = null; + private $degenerator = null; + private $token_data = null; + + /** + * Constructs b8 + * + * @access public + * @param array b8's configuration: [ 'lexer' => string, + 'degenerator' => string, + 'storage' => string, + 'use_relevant' => int, + 
'min_dev' => float, + 'rob_s' => float, + 'rob_x' => float ] + * @param array The storage backend's config (depending on the backend used) + * @param array The lexer's config (depending on the lexer used) + * @param array The degenerator's config (depending on the degenerator used) + * @return void + */ + function __construct(array $config = [], + array $config_storage = [], + array $config_lexer = [], + array $config_degenerator = []) + { + // Validate config data + foreach ($config as $name => $value) { + switch ($name) { + case 'min_dev': + case 'rob_s': + case 'rob_x': + $this->config[$name] = (float) $value; + break; + case 'use_relevant': + $this->config[$name] = (int) $value; + break; + case 'lexer': + case 'degenerator': + case 'storage': + $this->config[$name] = (string) $value; + break; + default: + throw new \Exception(b8::class . ": Unknown configuration key: \"$name\""); + } + } + + // Setup the degenerator class + $class = '\\b8\\degenerator\\' . $this->config['degenerator']; + $this->degenerator = new $class($config_degenerator); + + // Setup the lexer class + $class = '\\b8\\lexer\\' . $this->config['lexer']; + $this->lexer = new $class($config_lexer); + + // Setup the storage backend + $class = '\\b8\\storage\\' . 
$this->config['storage']; + $this->storage = new $class($config_storage, $this->degenerator); + } + + /** + * Classifies a text + * + * @access public + * @param string The text to classify + * @return mixed float The rating between 0 (ham) and 1 (spam) or an error code + */ + public function classify(string $text = null) + { + // Let's first see if the user called the function correctly + if ($text === null) { + return \b8\b8::CLASSIFIER_TEXT_MISSING; + } + + // Get the internal database variables, containing the number of ham and spam texts so the + // spam probability can be calculated in relation to them + $internals = $this->storage->get_internals(); + + // Calculate the spaminess of all tokens + + // Get all tokens we want to rate + $tokens = $this->lexer->get_tokens($text); + + // Check if the lexer failed (if so, $tokens will be a lexer error code, if not, $tokens + // will be an array) + if (! is_array($tokens)) { + return $tokens; + } + + // Fetch all available data for the token set from the database + $this->token_data = $this->storage->get(array_keys($tokens)); + + // Calculate the spaminess and importance for each token (or a degenerated form of it) + + $word_count = []; + $rating = []; + $importance = []; + + foreach ($tokens as $word => $count) { + $word_count[$word] = $count; + + // Although we only call this function only here ... 
let's do the calculation stuff in a + // function to make this a bit less confusing ;-) + $rating[$word] = $this->get_probability($word, $internals); + $importance[$word] = abs(0.5 - $rating[$word]); + } + + // Order by importance + arsort($importance); + reset($importance); + + // Get the most interesting tokens (use all if we have less than the given number) + $relevant = []; + for ($i = 0; $i < $this->config['use_relevant']; $i++) { + if ($token = key($importance)) { + // Important tokens remain + + // If the token's rating is relevant enough, use it + if (abs(0.5 - $rating[$token]) > $this->config['min_dev']) { + // Tokens that appear more than once also count more than once + for ($x = 0, $l = $word_count[$token]; $x < $l; $x++) { + array_push($relevant, $rating[$token]); + } + } + } else { + // We have less words as we want to use, so we already use what we have and can + // break here + break; + } + + next($importance); + } + + // Calculate the spaminess of the text (thanks to Mr. Robinson ;-) + + // We set both haminess and spaminess to 1 for the first multiplying + $haminess = 1; + $spaminess = 1; + + // Consider all relevant ratings + foreach ($relevant as $value) { + $haminess *= (1.0 - $value); + $spaminess *= $value; + } + + // If no token was good for calculation, we really don't know how to rate this text, so + // we can return 0.5 without further calculations. + if ($haminess == 1 && $spaminess == 1) { + return 0.5; + } + + // Calculate the combined rating + + // Get the number of relevant ratings + $n = count($relevant); + + // The actual haminess and spaminess + $haminess = 1 - pow($haminess, (1 / $n)); + $spaminess = 1 - pow($spaminess, (1 / $n)); + + // Calculate the combined indicator + $probability = ($haminess - $spaminess) / ($haminess + $spaminess); + + // We want a value between 0 and 1, not between -1 and +1, so ... 
+ $probability = (1 + $probability) / 2; + + // Alea iacta est + return $probability; + } + + /** + * Calculate the spaminess of a single token also considering "degenerated" versions + * + * @access private + * @param string The word to rate + * @param array The "internals" array + * @return float The word's rating + */ + private function get_probability(string $word, array $internals) + { + // Let's see what we have! + if (isset($this->token_data['tokens'][$word])) { + // The token is in the database, so we can use it's data as-is and calculate the + // spaminess of this token directly + return $this->calculate_probability($this->token_data['tokens'][$word], $internals); + } + + // The token was not found, so do we at least have similar words? + if (isset($this->token_data['degenerates'][$word])) { + // We found similar words, so calculate the spaminess for each one and choose the most + // important one for the further calculation + + // The default rating is 0.5 simply saying nothing + $rating = 0.5; + + foreach ($this->token_data['degenerates'][$word] as $degenerate => $count) { + // Calculate the rating of the current degenerated token + $rating_tmp = $this->calculate_probability($count, $internals); + + // Is it more important than the rating of another degenerated version? + if(abs(0.5 - $rating_tmp) > abs(0.5 - $rating)) { + $rating = $rating_tmp; + } + } + + return $rating; + } else { + // The token is really unknown, so choose the default rating for completely unknown + // tokens. 
This strips down to the robX parameter so we can cheap out the freaky math + // ;-) + return $this->config['rob_x']; + } + } + + /** + * Do the actual spaminess calculation of a single token + * + * @access private + * @param array The token's data [ \b8\b8::KEY_COUNT_HAM => int, + \b8\b8::KEY_COUNT_SPAM => int ] + * @param array The "internals" array + * @return float The rating + */ + private function calculate_probability(array $data, array $internals) + { + // Calculate the basic probability as proposed by Mr. Graham + + // But: consider the number of ham and spam texts saved instead of the number of entries + // where the token appeared to calculate a relative spaminess because we count tokens + // appearing multiple times not just once but as often as they appear in the learned texts. + + $rel_ham = $data[\b8\b8::KEY_COUNT_HAM]; + $rel_spam = $data[\b8\b8::KEY_COUNT_SPAM]; + + if ($internals[\b8\b8::KEY_TEXTS_HAM] > 0) { + $rel_ham = $data[\b8\b8::KEY_COUNT_HAM] / $internals[\b8\b8::KEY_TEXTS_HAM]; + } + + if ($internals[\b8\b8::KEY_TEXTS_SPAM] > 0) { + $rel_spam = $data[\b8\b8::KEY_COUNT_SPAM] / $internals[\b8\b8::KEY_TEXTS_SPAM]; + } + + $rating = $rel_spam / ($rel_ham + $rel_spam); + + // Calculate the better probability proposed by Mr. 
Robinson + $all = $data[\b8\b8::KEY_COUNT_HAM] + $data[\b8\b8::KEY_COUNT_SPAM]; + return (($this->config['rob_s'] * $this->config['rob_x']) + ($all * $rating)) + / ($this->config['rob_s'] + $all); + } + + /** + * Check the validity of the category of a request + * + * @access private + * @param string The category + * @return void + */ + private function check_category(string $category) + { + return $category === \b8\b8::HAM || $category === \b8\b8::SPAM; + } + + /** + * Learn a reference text + * + * @access public + * @param string The text to learn + * @param string Either b8::SPAM or b8::HAM + * @return mixed void or an error code + */ + public function learn(string $text = null, string $category = null) + { + // Let's first see if the user called the function correctly + if ($text === null) { + return \b8\b8::TRAINER_TEXT_MISSING; + } + if ($category === null) { + return \b8\b8::TRAINER_CATEGORY_MISSING; + } + + return $this->process_text($text, $category, \b8\b8::LEARN); + } + + /** + * Unlearn a reference text + * + * @access public + * @param string The text to unlearn + * @param string Either b8::SPAM or b8::HAM + * @return mixed void or an error code + */ + public function unlearn(string $text = null, string $category = null) + { + // Let's first see if the user called the function correctly + if ($text === null) { + return \b8\b8::TRAINER_TEXT_MISSING; + } + if ($category === null) { + return \b8\b8::TRAINER_CATEGORY_MISSING; + } + + return $this->process_text($text, $category, \b8\b8::UNLEARN); + } + + /** + * Does the actual interaction with the storage backend for learning or unlearning texts + * + * @access private + * @param string The text to process + * @param string Either b8::SPAM or b8::HAM + * @param string Either b8::LEARN or b8::UNLEARN + * @return mixed void or an error code + */ + private function process_text(string $text, string $category, string $action) + { + // Look if the request is okay + if (! 
$this->check_category($category)) { + return \b8\b8::TRAINER_CATEGORY_FAIL; + } + + // Get all tokens from $text + $tokens = $this->lexer->get_tokens($text); + + // Check if the lexer failed (if so, $tokens will be a lexer error code, if not, $tokens + // will be an array) + if (! is_array($tokens)) { + return $tokens; + } + + // Pass the tokens and what to do with it to the storage backend + return $this->storage->process_text($tokens, $category, $action); + } + +} diff --git a/serendipity_event_spamblock_bayes/b8/degenerator/standard.php b/serendipity_event_spamblock_bayes/b8/degenerator/standard.php new file mode 100644 index 00000000..f532d987 --- /dev/null +++ b/serendipity_event_spamblock_bayes/b8/degenerator/standard.php @@ -0,0 +1,176 @@ + + + This file is part of the b8 package + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation in version 2.1 of the License. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. +*/ + +/** + * A helper class to derive simplified tokens + * + * @license LGPL 2.1 + * @package b8 + * @author Tobias Leupold + */ + +namespace b8\degenerator; + +class standard +{ + public $config = [ 'multibyte' => true, + 'encoding' => 'UTF-8' ]; + + public $degenerates = []; + + /** + * Constructs the degenerator. 
+ * + * @access public + * @param array $config The configuration: [ 'multibyte' => bool, + 'encoding' => string ] + * @return void + */ + public function __construct(array $config) + { + // Validate config data + foreach ($config as $name => $value) { + switch($name) { + case 'multibyte': + $this->config[$name] = (bool) $value; + break; + case 'encoding': + $this->config[$name] = (string) $value; + break; + default: + throw new \Exception(standard::class . ": Unknown configuration key: " + . "\"$name\""); + } + } + } + + /** + * Generates a list of "degenerated" words for a list of words. + * + * @access public + * @param array $words The words to degenerate + * @return array An array containing an array of degenerated tokens for each token + */ + public function degenerate(array $words) + { + $degenerates = []; + + foreach ($words as $word) { + $degenerates[$word] = $this->degenerate_word($word); + } + + return $degenerates; + } + + /** + * Remove duplicates from a list of degenerates of a word. + * + * @access private + * @param string $word The word + * @param array $list The list to process + * @return array The list without duplicates + */ + private function delete_duplicates(string $word, array $list) + { + $list_processed = []; + + // Check each upper/lower version + foreach ($list as $alt_word) { + if ($alt_word != $word) { + array_push($list_processed, $alt_word); + } + } + + return $list_processed; + } + + /** + * Builds a list of "degenerated" versions of a word. 
+ * + * @access private + * @param string $word The word + * @return array An array of degenerated words + */ + private function degenerate_word(string $word) + { + // Check for any stored words so the process doesn't have to repeat + if (isset($this->degenerates[$word]) === true) { + return $this->degenerates[$word]; + } + + // Create different versions of upper and lower case + if ($this->config['multibyte'] === false) { + // The standard upper/lower versions + $lower = strtolower($word); + $upper = strtoupper($word); + $first = substr($upper, 0, 1) . substr($lower, 1, strlen($word)); + } elseif ($this->config['multibyte'] === true) { + // The multibyte upper/lower versions + $lower = mb_strtolower($word, $this->config['encoding']); + $upper = mb_strtoupper($word, $this->config['encoding']); + $first = mb_substr($upper, 0, 1, $this->config['encoding']) + . mb_substr($lower, 1, mb_strlen($word), $this->config['encoding']); + } + + // Add the versions + $upper_lower = []; + array_push($upper_lower, $lower); + array_push($upper_lower, $upper); + array_push($upper_lower, $first); + + // Delete duplicate upper/lower versions + $degenerate = $this->delete_duplicates($word, $upper_lower); + + // Append the original word + array_push($degenerate, $word); + + // Degenerate all versions + foreach ($degenerate as $alt_word) { + // Look for stuff like !!! and ??? + if (preg_match('/[!?]$/', $alt_word) > 0) { + // Add versions with different !s and ?s + if (preg_match('/[!?]{2,}$/', $alt_word) > 0) { + $tmp = preg_replace('/([!?])+$/', '$1', $alt_word); + array_push($degenerate, $tmp); + } + + $tmp = preg_replace('/([!?])+$/', '', $alt_word); + array_push($degenerate, $tmp); + } + + // Look for "..." 
at the end of the word + $alt_word_int = $alt_word; + while (preg_match('/[\.]$/', $alt_word_int) > 0) { + $alt_word_int = substr($alt_word_int, 0, strlen($alt_word_int) - 1); + array_push($degenerate, $alt_word_int); + } + } + + // Some degenerates are the same as the original word. These don't have to be fetched, so we + // create a new array with only new tokens + $degenerate = $this->delete_duplicates($word, $degenerate); + + // Store the list of degenerates for the token to prevent unnecessary re-processing + $this->degenerates[$word] = $degenerate; + + return $degenerate; + } + +} diff --git a/serendipity_event_spamblock_bayes/b8/lexer/standard.php b/serendipity_event_spamblock_bayes/b8/lexer/standard.php new file mode 100644 index 00000000..f6c8f6fd --- /dev/null +++ b/serendipity_event_spamblock_bayes/b8/lexer/standard.php @@ -0,0 +1,267 @@ + + + This file is part of the b8 package + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation in version 2.1 of the License. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
+*/ + +/** + * A helper class to disassemble a text to tokens + * + * @license LGPL 2.1 + * @package b8 + * @author Tobias Leupold + * @author Oliver Lillie (original PHP 5 port) + */ + +namespace b8\lexer; + +class standard +{ + const LEXER_TEXT_NOT_STRING = 'LEXER_TEXT_NOT_STRING'; + const LEXER_TEXT_EMPTY = 'LEXER_TEXT_EMPTY'; + + const LEXER_NO_TOKENS = 'b8*no_tokens'; + + private $config = [ 'min_size' => 3, + 'max_size' => 30, + 'get_uris' => true, + 'get_html' => true, + 'get_bbcode' => false, + 'allow_numbers' => false ]; + + private $tokens = null; + private $processed_text = null; + + // The regular expressions we use to split the text to tokens + private $regexp = [ 'raw_split' => '/[\s,\.\/"\:;\|<>\-_\[\]{}\+=\)\(\*\&\^%]+/', + 'ip' => '/([A-Za-z0-9\_\-\.]+)/', + 'uris' => '/([A-Za-z0-9\_\-]*\.[A-Za-z0-9\_\-\.]+)/', + 'html' => '/(<.+?>)/', + 'bbcode' => '/(\[.+?\])/', + 'tagname' => '/(.+?)\s/', + 'numbers' => '/^[0-9]+$/' ]; + + /** + * Constructs the lexer. + * + * @access public + * @param array $config The configuration: [ 'min_size' => int, + * 'max_size' => int, + * 'get_uris' => bool, + * 'get_html' => bool, + * 'get_bbcode' => bool, + * 'allow_numbers' => bool ] + * @return void + */ + function __construct(array $config) + { + // Validate config data + foreach ($config as $name=>$value) { + switch ($name) { + case 'min_size': + case 'max_size': + $this->config[$name] = (int) $value; + break; + case 'allow_numbers': + case 'get_uris': + case 'get_html': + case 'get_bbcode': + $this->config[$name] = (bool) $value; + break; + default: + throw new \Exception(standard::class . ": Unknown configuration key: " + . "\"$name\""); + } + } + } + + /** + * Splits a text to tokens. + * + * @access public + * @param string $text The text to disassemble + * @return mixed Returns a list of tokens or an error code + */ + public function get_tokens(string $text) + { + // Check if we actually have a string ... 
+ if (is_string($text) === false) { + return self::LEXER_TEXT_NOT_STRING; + } + + // ... and if it's empty + if (empty($text) === true) { + return self::LEXER_TEXT_EMPTY; + } + + // Re-convert the text to the original characters coded in UTF-8, as they have been coded in + // html entities during the post process + $this->processed_text = html_entity_decode($text, ENT_QUOTES, 'UTF-8'); + + // Reset the token list + $this->tokens = array(); + + if ($this->config['get_uris'] === true) { + // Get URIs + $this->get_uris($this->processed_text); + } + + if ($this->config['get_html'] === true) { + // Get HTML + $this->get_markup($this->processed_text, $this->regexp['html']); + } + + if ($this->config['get_bbcode'] === true) { + // Get BBCode + $this->get_markup($this->processed_text, $this->regexp['bbcode']); + } + + // We always want to do a raw split of the (remaining) text, so: + $this->raw_split($this->processed_text); + + // Be sure not to return an empty array + if (count($this->tokens) == 0) { + $this->tokens[self::LEXER_NO_TOKENS] = 1; + } + + // Return a list of all found tokens + return $this->tokens; + } + + /** + * Validates a token. + * + * @access private + * @param string $token The token string + * @return bool Returns true if the token is valid, otherwise returns false. + */ + private function is_valid(string $token) + { + // Just to be sure that the token's name won't collide with b8's internal variables + if (substr($token, 0, 3) == 'b8*') { + return false; + } + + // Validate the size of the token + $len = strlen($token); + if ($len < $this->config['min_size'] || $len > $this->config['max_size']) { + return false; + } + + // We may want to exclude pure numbers + if ($this->config['allow_numbers'] === false + && preg_match($this->regexp['numbers'], $token) > 0) { + + return false; + } + + // Token is okay + return true; + } + + /** + * Checks the validity of a token and adds it to the token list if it's valid. 
+ * + * @access private + * @param string $token + * @param string $word_to_remove Word to remove from the processed string + * @return void + */ + private function add_token(string $token, string $word_to_remove = null) + { + // Check the validity of the token + if (! $this->is_valid($token)) { + return; + } + + // Add it to the list or increase it's counter + if (! isset($this->tokens[$token])) { + $this->tokens[$token] = 1; + } else { + $this->tokens[$token] += 1; + } + + // If requested, remove the word or it's original version from the text + if ($word_to_remove !== null) { + $this->processed_text = str_replace($word_to_remove, '', $this->processed_text); + } + } + + /** + * Gets URIs. + * + * @access private + * @param string $text + * @return void + */ + private function get_uris(string $text) + { + // Find URIs + preg_match_all($this->regexp['uris'], $text, $raw_tokens); + foreach ($raw_tokens[1] as $word) { + // Remove a possible trailing dot + $word = rtrim($word, '.'); + // Try to add the found tokens to the list + $this->add_token($word, $word); + // Also process the parts of the found URIs + $this->raw_split($word); + } + } + + /** + * Gets HTML or BBCode markup, depending on the regexp used. + * + * @access private + * @param string $text + * @param string $regexp + * @return void + */ + private function get_markup(string $text, string $regexp) + { + // Search for the markup + preg_match_all($regexp, $text, $raw_tokens); + foreach ($raw_tokens[1] as $word) { + $actual_word = $word; + + // If the tag has parameters, just use the tag itself + if (strpos($word, ' ') !== false) { + preg_match($this->regexp['tagname'], $word, $match); + $actual_word = $match[1]; + $word = "$actual_word..." . substr($word, -1); + } + + // Try to add the found tokens to the list + $this->add_token($word, $actual_word); + } + } + + /** + * Does a raw split. 
+     *
+     * @access private
+     * @param string $text
+     * @return void
+     */
+    private function raw_split(string $text)
+    {
+        foreach (preg_split($this->regexp['raw_split'], $text) as $word) {
+            // Check the word and add it to the token list if it's valid
+            $this->add_token($word);
+        }
+    }
+
+}
diff --git a/serendipity_event_spamblock_bayes/b8/storage/dba.php b/serendipity_event_spamblock_bayes/b8/storage/dba.php
new file mode 100644
index 00000000..f4681feb
--- /dev/null
+++ b/serendipity_event_spamblock_bayes/b8/storage/dba.php
@@ -0,0 +1,105 @@
+
+
+   This file is part of the b8 package
+
+   This program is free software; you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation in version 2.1 of the License.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+namespace b8\storage;
+
+/**
+ * A Berkeley DB (DBA) storage backend
+ *
+ * @license LGPL 2.1
+ * @package b8
+ * @author Tobias Leupold
+ */
+
+class dba extends storage_base
+{
+
+    private $db = null;
+
+    protected function setup_backend(array $config)
+    {
+        // PHP >= 8.4 hands out a Dba\Connection object, older versions a "dba" resource handle
+        $resource = $config['resource'] ?? null;
+        $is_legacy_handle = is_resource($resource) && get_resource_type($resource) === 'dba';
+        if (! $is_legacy_handle && ! ($resource instanceof \Dba\Connection)) {
+            throw new \Exception(dba::class . ": No valid DBA resource passed");
+        }
+        $this->db = $resource;
+    }
+
+    protected function fetch_token_data(array $tokens)
+    {
+        $data = [];
+
+        foreach ($tokens as $token) {
+            // Try to fetch the raw data in the format "count_ham count_spam"
+            $count = dba_fetch($token, $this->db);
+
+            if ($count !== false) {
+                // Split the data by space characters
+                $split_data = explode(' ', $count);
+
+                // As an internal variable may have just one single value, we have to check for this
+                $count_ham  = isset($split_data[0]) ? (int) $split_data[0] : null;
+                $count_spam = isset($split_data[1]) ? (int) $split_data[1] : null;
+
+                // Append the parsed data
+                $data[$token] = [ \b8\b8::KEY_COUNT_HAM  => $count_ham,
+                                  \b8\b8::KEY_COUNT_SPAM => $count_spam ];
+            }
+        }
+
+        return $data;
+    }
+
+    private function assemble_count_value(array $count)
+    {
+        // Assemble the count data string
+        $count_value = $count[\b8\b8::KEY_COUNT_HAM] . ' ' . $count[\b8\b8::KEY_COUNT_SPAM];
+        // Remove whitespace from data of the internal variables
+        return(rtrim($count_value));
+    }
+
+    protected function add_token(string $token, array $count)
+    {
+        return dba_insert($token, $this->assemble_count_value($count), $this->db);
+    }
+
+    protected function update_token(string $token, array $count)
+    {
+        return dba_replace($token, $this->assemble_count_value($count), $this->db);
+    }
+
+    protected function delete_token(string $token)
+    {
+        return dba_delete($token, $this->db);
+    }
+
+    protected function start_transaction()
+    {
+        return;
+    }
+
+    protected function finish_transaction()
+    {
+        return;
+    }
+
+}
diff --git a/serendipity_event_spamblock_bayes/b8/storage/mysql.php b/serendipity_event_spamblock_bayes/b8/storage/mysql.php
new file mode 100644
index 00000000..1758f8a7
--- /dev/null
+++ b/serendipity_event_spamblock_bayes/b8/storage/mysql.php
@@ -0,0 +1,110 @@
+
+
+   This file is part of the b8 package
+
+   This program is free software; you can redistribute it and/or modify it
+   under the terms of the GNU Lesser
General Public License as published by
+   the Free Software Foundation in version 2.1 of the License.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+namespace b8\storage;
+
+/**
+ * A MySQL storage backend
+ *
+ * @license LGPL 2.1
+ * @package b8
+ * @author Tobias Leupold
+ */
+
+class mysql extends storage_base
+{
+
+    private $mysql = null;
+    private $table = null;
+
+    protected function setup_backend(array $config)
+    {
+        if (! isset($config['resource'])
+            || ! ($config['resource'] instanceof \mysqli)) {
+
+            throw new \Exception(mysql::class . ": No valid mysqli object passed");
+        }
+        $this->mysql = $config['resource'];
+
+        if (! isset($config['table'])) {
+            throw new \Exception(mysql::class . ": No b8 wordlist table name passed");
+        }
+        $this->table = $config['table'];
+    }
+
+    protected function fetch_token_data(array $tokens)
+    {
+        $data = [];
+
+        $escaped = [];
+        foreach ($tokens as $token) {
+            $escaped[] = $this->mysql->real_escape_string($token);
+        }
+        $result = $this->mysql->query('SELECT token, count_ham, count_spam'
+                                      . ' FROM ' . $this->table
+                                      . ' WHERE token IN '
+                                      . "('" . implode("','", $escaped) . "')");
+
+        while ($row = $result->fetch_row()) {
+            $data[$row[0]] = [ \b8\b8::KEY_COUNT_HAM  => $row[1],
+                               \b8\b8::KEY_COUNT_SPAM => $row[2] ];
+        }
+
+        $result->free_result();
+
+        return $data;
+    }
+
+    protected function add_token(string $token, array $count)
+    {
+        $query = $this->mysql->prepare('INSERT INTO ' . $this->table
+                                       . '(token, count_ham, count_spam) VALUES(?, ?, ?)');
+        $query->bind_param('sii', $token, $count[\b8\b8::KEY_COUNT_HAM],
+                           $count[\b8\b8::KEY_COUNT_SPAM]);
+        return $query->execute();
+    }
+
+    protected function update_token(string $token, array $count)
+    {
+        $query = $this->mysql->prepare('UPDATE ' . $this->table
+                                       . ' SET count_ham = ?, count_spam = ? WHERE token = ?');
+        $query->bind_param('iis', $count[\b8\b8::KEY_COUNT_HAM], $count[\b8\b8::KEY_COUNT_SPAM],
+                           $token);
+        return $query->execute();
+    }
+
+    protected function delete_token(string $token)
+    {
+        $query = $this->mysql->prepare('DELETE FROM ' . $this->table . ' WHERE token = ?');
+        $query->bind_param('s', $token);
+        return $query->execute();
+    }
+
+    protected function start_transaction()
+    {
+        $this->mysql->begin_transaction();
+    }
+
+    protected function finish_transaction()
+    {
+        $this->mysql->commit();
+    }
+
+}
diff --git a/serendipity_event_spamblock_bayes/b8/storage/sqlite.php b/serendipity_event_spamblock_bayes/b8/storage/sqlite.php
new file mode 100644
index 00000000..05c64dfe
--- /dev/null
+++ b/serendipity_event_spamblock_bayes/b8/storage/sqlite.php
@@ -0,0 +1,108 @@
+
+
+   This file is part of the b8 package
+
+   This program is free software; you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation in version 2.1 of the License.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+namespace b8\storage;
+use PDO;
+
+/**
+ * A sqlite storage backend
+ *
+ * @license LGPL 2.1
+ * @package b8
+ * @author Tobias Leupold
+ */
+
+class sqlite extends storage_base
+{
+
+    private $sqlite = null;
+    private $table = null;
+
+    protected function setup_backend(array $config)
+    {
+        $this->sqlite = $config['resource'];
+
+        if (! isset($config['table'])) {
+            $config['table'] = 'b8_wordlist';
+        }
+        $this->table = $config['table'];
+    }
+
+    protected function fetch_token_data(array $tokens)
+    {
+        $data = [];
+
+        $escaped = [];
+        foreach ($tokens as $token) {
+            $escaped[] = $this->sqlite->quote($token);
+        }
+
+        $result = $this->sqlite->query('SELECT token, count_ham, count_spam'
+                                      . ' FROM ' . $this->table
+                                      . ' WHERE token IN '
+                                      . "(" . implode(",", $escaped) . ")");
+
+        while ($row = $result->fetch()) {
+            $data[$row[0]] = [ \b8\b8::KEY_COUNT_HAM  => $row[1],
+                               \b8\b8::KEY_COUNT_SPAM => $row[2] ];
+        }
+
+        return $data;
+    }
+
+    protected function add_token(string $token, array $count)
+    {
+        $query = $this->sqlite->prepare('INSERT INTO ' . $this->table
+                                       . '(token, count_ham, count_spam) VALUES(?, ?, ?)');
+        $query->bindParam(1, $token, PDO::PARAM_STR);
+        $query->bindParam(2, $count[\b8\b8::KEY_COUNT_HAM], PDO::PARAM_INT);
+        $query->bindParam(3, $count[\b8\b8::KEY_COUNT_SPAM], PDO::PARAM_INT);
+
+        return $query->execute();
+    }
+
+    protected function update_token(string $token, array $count)
+    {
+        $query = $this->sqlite->prepare('UPDATE ' . $this->table
+                                       . ' SET count_ham = ?, count_spam = ? WHERE token = ?');
+        $query->bindParam(1, $count[\b8\b8::KEY_COUNT_HAM], PDO::PARAM_INT);
+        $query->bindParam(2, $count[\b8\b8::KEY_COUNT_SPAM], PDO::PARAM_INT);
+        $query->bindParam(3, $token, PDO::PARAM_STR);
+        return $query->execute();
+    }
+
+    protected function delete_token(string $token)
+    {
+        $query = $this->sqlite->prepare('DELETE FROM ' . $this->table . ' WHERE token = ?');
+        $query->bindParam(1, $token, PDO::PARAM_STR);
+        return $query->execute();
+    }
+
+    protected function start_transaction()
+    {
+        $this->sqlite->beginTransaction();
+    }
+
+    protected function finish_transaction()
+    {
+        $this->sqlite->commit();
+    }
+
+}
diff --git a/serendipity_event_spamblock_bayes/b8/storage/storage_base.php b/serendipity_event_spamblock_bayes/b8/storage/storage_base.php
new file mode 100644
index 00000000..2d6dd16c
--- /dev/null
+++ b/serendipity_event_spamblock_bayes/b8/storage/storage_base.php
@@ -0,0 +1,316 @@
+
+
+   This file is part of the b8 package
+
+   This program is free software; you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation in version 2.1 of the License.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+/**
+ * Abstract base class for storage backends
+ *
+ * @license LGPL 2.1
+ * @package b8
+ * @author Tobias Leupold
+ */
+
+namespace b8\storage;
+
+abstract class storage_base
+{
+    protected $degenerator = null;
+
+    /**
+     * Sets up the backend
+     *
+     * @access protected
+     * @param array The configuration for the respective backend
+     */
+    abstract protected function setup_backend(array $config);
+
+    /**
+     * Does the actual interaction with the database when fetching data
+     *
+     * @access protected
+     * @param array $tokens List of token names to fetch
+     * @return mixed Returns an array of the returned data in the format array(token => data)
+               or an empty array if there was no data.
+     */
+    abstract protected function fetch_token_data(array $tokens);
+
+    /**
+     * Stores a new token to the database
+     *
+     * @access protected
+     * @param string $token The token's name
+     * @param array $count The ham and spam counters [ \b8\b8::KEY_COUNT_HAM => int,
+                                                       \b8\b8::KEY_COUNT_SPAM => int ]
+     * @return bool true on success or false on failure
+     */
+    abstract protected function add_token(string $token, array $count);
+
+    /**
+     * Updates an existing token
+     *
+     * @access protected
+     * @param string $token The token's name
+     * @param array $count The ham and spam counters [ \b8\b8::KEY_COUNT_HAM => int,
+                                                       \b8\b8::KEY_COUNT_SPAM => int ]
+     * @return bool true on success or false on failure
+     */
+    abstract protected function update_token(string $token, array $count);
+
+    /**
+     * Removes a token from the database
+     *
+     * @access protected
+     * @param string $token The token's name
+     * @return bool true on success or false on failure
+     */
+    abstract protected function delete_token(string $token);
+
+    /**
+     * Starts a transaction (if the underlying database supports/needs this)
+     *
+     * @access protected
+     * @return void
+     */
+    abstract protected function start_transaction();
+
+    /**
+     * Finishes a transaction (if the underlying database supports/needs this)
+     *
+     * @access protected
+     * @return void
+     */
+    abstract protected function finish_transaction();
+
+    /**
+     * Passes the degenerator to the instance and calls the backend setup
+     *
+     * @access public
+     * @param array The respective backend's configuration
+     * @param object The degenerator to use
+     * @return void
+     */
+    public function __construct(array $config, object $degenerator)
+    {
+        $this->degenerator = $degenerator;
+        $this->setup_backend($config);
+
+        $internals = $this->get_internals();
+        if (! isset($internals[\b8\b8::KEY_DB_VERSION])
+            || $internals[\b8\b8::KEY_DB_VERSION] !== \b8\b8::DBVERSION) {
+
+            throw new \Exception(storage_base::class . ': The connected database is not a b8 v'
+                                 . \b8\b8::DBVERSION . ' database.');
+        }
+    }
+
+    /**
+     * Get the database's internal variables.
+     *
+     * @access public
+     * @return array Returns an array of all internals.
+     */
+    public function get_internals()
+    {
+        $internals = $this->fetch_token_data([ \b8\b8::INTERNALS_TEXTS,
+                                               \b8\b8::INTERNALS_DBVERSION ]);
+
+        // Just in case this is called by check_database() and it's not yet clear if we actually
+        // have a b8 database
+        $texts_ham = null;
+        $texts_spam = null;
+        $dbversion = null;
+        if(isset($internals[\b8\b8::INTERNALS_TEXTS][\b8\b8::KEY_COUNT_HAM])) {
+            $texts_ham = (int) $internals[\b8\b8::INTERNALS_TEXTS][\b8\b8::KEY_COUNT_HAM];
+        }
+        if(isset($internals[\b8\b8::INTERNALS_TEXTS][\b8\b8::KEY_COUNT_SPAM])) {
+            $texts_spam = (int) $internals[\b8\b8::INTERNALS_TEXTS][\b8\b8::KEY_COUNT_SPAM];
+        }
+        if(isset($internals[\b8\b8::INTERNALS_DBVERSION][\b8\b8::KEY_COUNT_HAM])) {
+            $dbversion = (int) $internals[\b8\b8::INTERNALS_DBVERSION][\b8\b8::KEY_COUNT_HAM];
+        }
+
+        return [ \b8\b8::KEY_TEXTS_HAM  => $texts_ham,
+                 \b8\b8::KEY_TEXTS_SPAM => $texts_spam,
+                 \b8\b8::KEY_DB_VERSION => $dbversion ];
+    }
+
+    /**
+     * Get all data about a list of tokens from the database.
+     *
+     * @access public
+     * @param array The tokens list
+     * @return mixed Returns False on failure, otherwise returns array of returned data
+               in the format [ 'tokens'      => [ token => count ],
+                               'degenerates' => [ token => [ degenerate => count ] ] ].
+     */
+    public function get(array $tokens)
+    {
+        // First we see what we have in the database
+        $token_data = $this->fetch_token_data($tokens);
+
+        // Check if we have to degenerate some tokens
+        $missing_tokens = [];
+        foreach ($tokens as $token) {
+            if (! isset($token_data[$token])) {
+                $missing_tokens[] = $token;
+            }
+        }
+
+        if (count($missing_tokens) > 0) {
+            // We have to degenerate some tokens
+            $degenerates_list = [];
+
+            // Generate a list of degenerated tokens for the missing tokens ...
+            $degenerates = $this->degenerator->degenerate($missing_tokens);
+
+            // ... and look them up
+            foreach ($degenerates as $token => $token_degenerates) {
+                $degenerates_list = array_merge($degenerates_list, $token_degenerates);
+            }
+
+            // array_replace() instead of array_merge(): the map is keyed by token, and
+            // array_merge() would renumber purely numeric tokens (e.g. "2020") and lose them
+            $token_data = array_replace($token_data, $this->fetch_token_data($degenerates_list));
+        }
+
+        // Here, we have all available data in $token_data.
+
+        $return_data_tokens = [];
+        $return_data_degenerates = [];
+
+        foreach ($tokens as $token) {
+            if (isset($token_data[$token])) {
+                // The token was found in the database
+                $return_data_tokens[$token] = $token_data[$token];
+            } else {
+                // The token was not found, so we look if we can return data for degenerated tokens
+                foreach ($this->degenerator->degenerates[$token] as $degenerate) {
+                    if (isset($token_data[$degenerate])) {
+                        // A degenerated version of the token was found in the database
+                        $return_data_degenerates[$token][$degenerate] = $token_data[$degenerate];
+                    }
+                }
+            }
+        }
+
+        // Now, all token data directly found in the database is in $return_data_tokens and all
+        // data for degenerated versions is in $return_data_degenerates, so
+        return [ 'tokens'      => $return_data_tokens,
+                 'degenerates' => $return_data_degenerates ];
+    }
+
+    /**
+     * Stores or deletes a list of tokens from the given category.
+     *
+     * @access public
+     * @param array The tokens list
+     * @param string Either \b8\b8::HAM or \b8\b8::SPAM
+     * @param string Either \b8\b8::LEARN or \b8\b8::UNLEARN
+     * @return void
+     */
+    public function process_text(array $tokens, string $category, string $action)
+    {
+        // No matter what we do, we first have to check what data we have.
+
+        // First get the internals, including the ham texts and spam texts counter
+        $internals = $this->get_internals();
+        // Then, fetch all data for all tokens we have
+        $token_data = $this->fetch_token_data(array_keys($tokens));
+
+        $this->start_transaction();
+
+        // Process all tokens to learn/unlearn
+        foreach ($tokens as $token => $count) {
+            if (isset($token_data[$token])) {
+                // We already have this token, so update its data
+
+                // Get the existing data
+                $count_ham  = $token_data[$token][\b8\b8::KEY_COUNT_HAM];
+                $count_spam = $token_data[$token][\b8\b8::KEY_COUNT_SPAM];
+
+                // Increase or decrease the right counter
+                if ($action === \b8\b8::LEARN) {
+                    if ($category === \b8\b8::HAM) {
+                        $count_ham += $count;
+                    } elseif ($category === \b8\b8::SPAM) {
+                        $count_spam += $count;
+                    }
+                } elseif ($action === \b8\b8::UNLEARN) {
+                    if ($category === \b8\b8::HAM) {
+                        $count_ham -= $count;
+                    } elseif ($category === \b8\b8::SPAM) {
+                        $count_spam -= $count;
+                    }
+                }
+
+                // We don't want to have negative values
+                if ($count_ham < 0) {
+                    $count_ham = 0;
+                }
+                if ($count_spam < 0) {
+                    $count_spam = 0;
+                }
+
+                // Now let's see if we have to update or delete the token
+                if ($count_ham != 0 || $count_spam != 0) {
+                    $this->update_token($token, [ \b8\b8::KEY_COUNT_HAM  => $count_ham,
+                                                  \b8\b8::KEY_COUNT_SPAM => $count_spam ]);
+                } else {
+                    $this->delete_token($token);
+                }
+            } else {
+                // We don't have the token. If we unlearn a text, we can't delete it as we don't
+                // have it anyway, so just do something if we learn a text
+                if ($action === \b8\b8::LEARN) {
+                    if ($category === \b8\b8::HAM) {
+                        $this->add_token($token, [ \b8\b8::KEY_COUNT_HAM  => $count,
+                                                   \b8\b8::KEY_COUNT_SPAM => 0 ]);
+                    } elseif ($category === \b8\b8::SPAM) {
+                        $this->add_token($token, [ \b8\b8::KEY_COUNT_HAM  => 0,
+                                                   \b8\b8::KEY_COUNT_SPAM => $count ]);
+                    }
+                }
+            }
+        }
+
+        // Now, all token have been processed, so let's update the right text
+        if ($action === \b8\b8::LEARN) {
+            if ($category === \b8\b8::HAM) {
+                $internals[\b8\b8::KEY_TEXTS_HAM]++;
+            } elseif ($category === \b8\b8::SPAM) {
+                $internals[\b8\b8::KEY_TEXTS_SPAM]++;
+            }
+        } elseif ($action === \b8\b8::UNLEARN) {
+            if ($category === \b8\b8::HAM) {
+                if ($internals[\b8\b8::KEY_TEXTS_HAM] > 0) {
+                    $internals[\b8\b8::KEY_TEXTS_HAM]--;
+                }
+            } elseif ($category === \b8\b8::SPAM) {
+                if ($internals[\b8\b8::KEY_TEXTS_SPAM] > 0) {
+                    $internals[\b8\b8::KEY_TEXTS_SPAM]--;
+                }
+            }
+        }
+
+        $this->update_token(\b8\b8::INTERNALS_TEXTS,
+                            [ \b8\b8::KEY_COUNT_HAM  => $internals[\b8\b8::KEY_TEXTS_HAM],
+                              \b8\b8::KEY_COUNT_SPAM => $internals[\b8\b8::KEY_TEXTS_SPAM] ]);
+
+        $this->finish_transaction();
+    }
+
+}
diff --git a/serendipity_event_spamblock_bayes/bayesAnalysis.tpl b/serendipity_event_spamblock_bayes/bayesAnalysis.tpl
deleted file mode 100644
index ff9f00ba..00000000
--- a/serendipity_event_spamblock_bayes/bayesAnalysis.tpl
+++ /dev/null
@@ -1,27 +0,0 @@
-
-{foreach from=$comments item=comment} -

{$CONST.COMMENT} #{$comment.id}

- -
    - {foreach from=$types item=type} -
  • -
    {$type}
    -
    {$comment.$type|escape:"html"}
    -
    - {if $comment.ratings.$type != "-"} - {$comment.ratings.$type|regex_replace:"/\..*/":""}% - {else} - {$comment.ratings.$type} - {/if} -
    -
  • - {/foreach} -
- -
{$comment.rating|regex_replace:"/\..*/":""}%
-{/foreach} - - - -
\ No newline at end of file diff --git a/serendipity_event_spamblock_bayes/bayesAnalysismenu.tpl b/serendipity_event_spamblock_bayes/bayesAnalysismenu.tpl deleted file mode 100644 index 4fca1498..00000000 --- a/serendipity_event_spamblock_bayes/bayesAnalysismenu.tpl +++ /dev/null @@ -1,73 +0,0 @@ -
-
- {if $s9ybackend == 1} -
- {else} -
- {else} - - {/if} - -
-
    - {foreach from=$comments item=comment } -
  • - - -
    - {$comment.author|escape:"html"}, {$comment.body|escape:"html"} -
    -
  • - {/foreach} - -
-
- - - - - {if $s9ybackend == 1} -
- {else} -
- {else} - - {/if} -
-
\ No newline at end of file diff --git a/serendipity_event_spamblock_bayes/bayesDBmenu.tpl b/serendipity_event_spamblock_bayes/bayesDBmenu.tpl deleted file mode 100644 index e81a67c8..00000000 --- a/serendipity_event_spamblock_bayes/bayesDBmenu.tpl +++ /dev/null @@ -1,112 +0,0 @@ -
-
-
- -
-
- -
-
- -
-
- -
-
- -
-
- -
- - - - - - - - - - - - {foreach from=$bayesTable item=row} - - {foreach from=$row item=value} - - {/foreach} - - {/foreach} - -
{$CONST.PLUGIN_EVENT_SPAMBLOCK_BAYES_MENU_DATABASE}
tokenhamspamtype
- {$value} -
- {if $pages > 1} -
- {if $curpage > 2} - 1 - ... - {elseif $curpage > 1} - 1 - {/if} - - {section name=page start=1 loop=$pages+1} - {if $curpage == $smarty.section.page.index -1} - {$smarty.section.page.index} - {/if} - {if $curpage == $smarty.section.page.index -2 || $curpage == $smarty.section.page.index} - {$smarty.section.page.index} - {/if} - {/section} - - {if $curpage < $pages -3} - ... - {$pages} - {elseif $curpage < $pages -2} - {$pages} - {/if} -
- {/if} - -
- - - - - - - - - - - - - {section name=i loop=6 start=0} - - - {/section} - - - - - - - - - - - - - - - - - - -
{$CONST.PLUGIN_EVENT_SPAMBLOCK_BAYES_SAVEDVALUES}
{$CONST.NAME}{$CONST.HOMEPAGE}{$CONST.EMAIL}{$CONST.IP}{$CONST.REFERER}{$CONST.COMMENT}
{$CONST.PLUGIN_EVENT_SPAMBLOCK_BAYES_HAM}{$CONST.PLUGIN_EVENT_SPAMBLOCK_BAYES_SPAM}
{$author_ham}{$author_spam}{$url_ham}{$url_spam}{$email_ham}{$email_spam}{$ip_ham}{$ip_spam}{$referer_ham}{$referer_spam}{$body_ham}{$body_spam}
-
- - - - -
diff --git a/serendipity_event_spamblock_bayes/bayesImportmenu.tpl b/serendipity_event_spamblock_bayes/bayesImportmenu.tpl deleted file mode 100644 index ed2f599c..00000000 --- a/serendipity_event_spamblock_bayes/bayesImportmenu.tpl +++ /dev/null @@ -1,31 +0,0 @@ -
- {if $s9ybackend == 1}

{else} {/if}{$CONST.PLUGIN_EVENT_SPAMBLOCK_BAYES_IMPORT_EXPLANATION}{if $s9ybackend == 1}

{else}{/if} - -
- {if $s9ybackend != 1} -
- {/if} - - - {if $s9ybackend != 1} -
- {/if} -
- -

{$CONST.PLUGIN_EVENT_SPAMBLOCK_BAYES_TROJA}

- - {if $s9ybackend == 1}

{else} {/if}{$CONST.PLUGIN_EVENT_SPAMBLOCK_BAYES_TROJA_EXPLANATION}{if $s9ybackend == 1}

{else}{/if} - - - - - {if $trojaRegistered} -
- -
- {else} -
- -
- {/if} -
\ No newline at end of file diff --git a/serendipity_event_spamblock_bayes/bayesLearnmenu.tpl b/serendipity_event_spamblock_bayes/bayesLearnmenu.tpl deleted file mode 100644 index 8f6cd4b8..00000000 --- a/serendipity_event_spamblock_bayes/bayesLearnmenu.tpl +++ /dev/null @@ -1,86 +0,0 @@ -
-
- {if $s9ybackend == 1} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - -
- - {else} -
- - -
- -
- - -
- -
- - -
- -
- - -
- -
- - -
- -
- - -
- -
-
- - -
- -
- - -
-
- -
- -
- {/if} -
-
diff --git a/serendipity_event_spamblock_bayes/bayesNavigation.tpl b/serendipity_event_spamblock_bayes/bayesNavigation.tpl deleted file mode 100644 index bdd67e62..00000000 --- a/serendipity_event_spamblock_bayes/bayesNavigation.tpl +++ /dev/null @@ -1,32 +0,0 @@ -{if $jquery_needed == true} - -{/if} - - -{if $s9ybackend != 1} -

{$CONST.PLUGIN_EVENT_SPAMBLOCK_BAYES_NAME}

-{/if} -
- -
diff --git a/serendipity_event_spamblock_bayes/bayesRecyclermenu.tpl b/serendipity_event_spamblock_bayes/bayesRecyclermenu.tpl index 58c446e1..cad25053 100644 --- a/serendipity_event_spamblock_bayes/bayesRecyclermenu.tpl +++ b/serendipity_event_spamblock_bayes/bayesRecyclermenu.tpl @@ -1,121 +1,39 @@ +

{$CONST.PLUGIN_EVENT_SPAMBLOCK_BAYES_NAME}

+
-
+
- {if $s9ybackend == 1} -
- {else} -
- {else} - - {/if} - {if $s9ybackend != 1} -
    - {/if} - {foreach from=$comments item=comment} - {if $s9ybackend == 1} -
    - - - - - - - - - - - - - - - - - - - - -
    {$CONST.AUTHOR}{$CONST.COMMENT}{$CONST.DATE}{$CONST.PLUGIN_EVENT_SPAMBLOCK_BAYES_RATING}
    {$comment.author|truncate:20:"..."|escape:"html"}{$comment.body|truncate:20:"..."|escape:"html"}{$comment.timestamp|date_format:"%d.%m.%y, %R"}{$comment.rating|regex_replace:"/\..*/":""}%
    -
    -
    - {foreach from=$types item=type} -
    {$type}
    -
    {$comment.$type|escape:"html"}
    - {/foreach} -
    {$CONST.Article}
    -
    {$comment.article_title}
    -
    -
    - {else} +
      + {foreach from=$comments item=comment}
    • -

      {$comment.author|truncate:20:"..."|escape:"html"} {$CONST.IN_REPLY_TO} {$comment.article_title} {$CONST.ON} {$comment.timestamp|date_format:"%d.%m.%y, %R"} – {$comment.rating|regex_replace:"/\..*/":""}%

      +

      {$comment.author|truncate:20:"..."|escape:"html"} {$CONST.IN_REPLY_TO} {$comment.article_title} {$CONST.ON} {$comment.timestamp|date_format:"%d.%m.%y, %R"}

      - {foreach from=$types item=type} -
      {$type}
      -
      {$comment.$type|escape:"html"}
      - {/foreach} +
      name
      +
      {$comment.author|escape:"html"}
      +
      email
      +
      {$comment.email|escape:"html"}
      +
      url
      +
      {$comment.url|escape:"html"}
      +
      comment
      +
      {$comment.body|escape:"html"}
    • - {/if} {/foreach} - {if $s9ybackend == 1} -
      - {else}
    -
- {else} - - {/if}
-{if $s9ybackend == 1} - -{/if}
\ No newline at end of file diff --git a/serendipity_event_spamblock_bayes/bayes_commentlist.js b/serendipity_event_spamblock_bayes/bayes_commentlist.js index 4590b7b7..1f6e3fb8 100644 --- a/serendipity_event_spamblock_bayes/bayes_commentlist.js +++ b/serendipity_event_spamblock_bayes/bayes_commentlist.js @@ -1,302 +1,11 @@ -var httpRequest; -var lastID; - -function ham(id) { - if (window.XMLHttpRequest) { // Mozilla, Safari, Opera, IE7 - httpRequest = new XMLHttpRequest(); - } else if (window.ActiveXObject) { // IE6, IE5 - httpRequest = new ActiveXObject("Microsoft.XMLHTTP"); - } - var copyId = id; - httpRequest.onreadystatechange = function() { - setMessage(copyId); - } - lastID = id; - // Method, url, Async = true / Sync = false - httpRequest.open('POST', learncommentPath, true); - httpRequest.setRequestHeader('content-Type', 'application/x-www-form-urlencoded; charset='+bayesCharset); - if (id.constructor == Array) { - var length = id.length - for (var i=0;i");g.text(d[f].textContent).hide(),a(d[f]).after(g),d[f].textContent="",e++}a.each(d,function(d,e){if(a(e)[0].nodeType==1&&e==a(e).parent().find("> summary:first-of-type")[0])a(e).data("processed")!=!0&&(a(e).css({display:"block",cursor:"pointer"}).data("processed",!0).addClass("detailHidden").bind("click",function(){b.toggleDetailChildren(a(this))}),a(c).prepend(a(e)));else if(a(e)[0].nodeType==3&&!e.isElementContentWhitespace&&!!a.browser.safari==!1){var f=a("");f.text(e.textContent).hide(),a(e).after(f),e.textContent=""}else if(a(c).find("> summary").length==0){var g=a("").text("Details").css({display:"block",cursor:"pointer"}).data("processed",!0).addClass("detailHidden").bind("click",function(){b.toggleDetailChildren(a(this))});a(c).prepend(g)}a(c).find("> :visible:not(summary:first-child)").hide()})},this.showDetailChildren=function(b){a(b).attr("open",!0),a.each(a(b).find("> 
*"),function(b,c){a(c).show()})},this.toggleDetailChildren=function(a){a.hasClass("detailHidden")?(a.removeClass("detailHidden"),b.showDetailChildren(a.parents("details")[0])):(a.addClass("detailHidden"),b.hideDetailChildren(a.parents("details")[0]))};var c=function(a){var b=a.createElement("details"),c,d,e;return"open"in b?(d=a.body||function(){var b=a.documentElement;return c=!0,b.insertBefore(a.createElement("body"),b.firstElementChild||b.firstChild)}(),b.innerHTML="ab",b.style.display="block",d.appendChild(b),e=b.offsetHeight,b.open=!0,e=e!=b.offsetHeight,d.removeChild(b),c&&d.parentNode.removeChild(d),e):!1}(document);if(c==!1){if(a("details").length!==0){var d=a(" - - - "; - return true; - break; - - case 'backend_sidebar_entries': - if (!serendipity_checkPermission('adminComments')) { - break; - } - if ($serendipity['version'][0] == 1) { - if ($this->get_config('menu', true)) { - echo ''; - } - } else { - } - return true; - break; + case 'xmlrpc_comment_ham': + $comment_id = $addData['cid']; + $entry_id = $addData['id']; + $comment = eventData['url'] . ' ' . $eventData['body'] . ' ' . $eventData['name'] . ' ' . $eventData['email']; + $this->learn($comment, 'ham'); + //moderated ham-comments should be instantly approved, that's why they need an id: + serendipity_approveComment($comment_id, $entry_id); + break; case 'backend_sidebar_admin_appearance': if (!serendipity_checkPermission('adminComments')) { break; } - if ($serendipity['version'][0] == 1) { - } else { - if ($this->get_config('menu', true)) { - echo '
  • ' . PLUGIN_EVENT_SPAMBLOCK_BAYES_NAME . '
  • '; - } - } - return true; + + echo ''; break; case 'backend_sidebar_entries_event_display_spamblock_bayes': if (!serendipity_checkPermission('adminComments')) { break; } - $path = $this->path = $this->get_config('path', $serendipity['serendipityHTTPPath'] . 'plugins/serendipity_event_spamblock_bayes/'); - if (!empty($path) && $path != 'default' && $path != 'none' && $path != 'empty') { - $path_defined = true; - $imgpath = $path . 'img/'; - } else { - $path_defined = false; - $imgpath = $serendipity['baseURL'] . 'index.php?/plugin/'; - } - global $serendipity; - if (isset($serendipity['GET']['message'])) { - if ($serendipity['version'][0] == 1) { - echo '

    '.(function_exists('serendipity_specialchars') ? serendipity_specialchars($serendipity['GET']['message']) : htmlspecialchars($serendipity['GET']['message'], ENT_COMPAT, LANG_CHARSET)).'

    '; - } else { - echo ' ' . (function_exists('serendipity_specialchars') ? serendipity_specialchars($serendipity['GET']['message']) : htmlspecialchars($serendipity['GET']['message'], ENT_COMPAT, LANG_CHARSET)) . ''; - } - } - if (isset($serendipity['GET']['success'])) { - if ($serendipity['version'][0] == 1) { - echo '

    '.(function_exists('serendipity_specialchars') ? serendipity_specialchars($serendipity['GET']['success']) : htmlspecialchars($serendipity['GET']['success'], ENT_COMPAT, LANG_CHARSET)).'

    '; - } else { - echo ' ' . (function_exists('serendipity_specialchars') ? serendipity_specialchars($serendipity['GET']['success']) : htmlspecialchars($serendipity['GET']['success'], ENT_COMPAT, LANG_CHARSET)) . ''; - } - } - if (isset($serendipity['GET']['error'])) { - if ($serendipity['version'][0] == 1) { - echo '

    '.(function_exists('serendipity_specialchars') ? serendipity_specialchars($serendipity['GET']['error']) : htmlspecialchars($serendipity['GET']['error'], ENT_COMPAT, LANG_CHARSET)).'

    '; - } else { - echo ' ' . (function_exists('serendipity_specialchars') ? serendipity_specialchars($serendipity['GET']['error']) : htmlspecialchars($serendipity['GET']['error'], ENT_COMPAT, LANG_CHARSET)) . ''; - } - } - $this->get = $serendipity['GET']; - - $this->displayMenu($serendipity['GET']['subpage']); - return true; + + $this->displayRecycler(); break; - case 'xmlrpc_comment_spam': - $entry_id = $addData['id']; - $comment_id = $addData['cid']; - if($this->get_config('method', 'moderate') == 'custom') { - $spamBarrier = min(array( - $this->get_config('moderateBarrier', 70) / 100, - $this->get_config('blockBarrier', 90) / 100 - )); - } else { - $spamBarrier = 0.7; - } - //spam shall not get through the filter twice - so make sure, it really is marked as spam - - $loop = 0; - while ($this->startClassify($eventData) < $spamBarrier && $loop < 5) { - $this->startLearn($eventData, 'spam'); - //prevent infinite loop - $loop++; - } - if ($this->get_config('recycler', true)) { - $this->recycleComment($comment_id, $entry_id); - } - serendipity_deleteComment($comment_id, $entry_id); - return true; - break; - - case 'xmlrpc_comment_ham': - $this->startLearn($eventData, 'ham'); - $comment_id = $addData['cid']; - $entry_id = $addData['id']; - //moderated ham-comments should be instantly approved, that's why they need an id: - serendipity_approveComment($comment_id, $entry_id); - return true; - break; - - + + case 'js_backend': + echo "var learncommentPath = '{$serendipity['baseURL']}index.php?/plugin/bayes_learncomment';"; + echo file_get_contents(dirname(__FILE__). 
'/bayes_commentlist.js'); + break; + + case 'css_backend': + echo '.spamblockBayesControls { cursor: pointer; }'; + break; default : return false; - break; } + return true; } else { return false; } } - function getRequest($url) { - if (function_exists('curl_init')) { - $ch = curl_init($url); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); - $response = curl_exec($ch); - curl_close ($ch); - } else { - $options = array('http' => array( - 'method' => 'GET' - )); - $context = stream_context_create($options); - $response = file_get_contents($url, false, $context); - } - return $response; - } - - #Show the whole additional configuration, specifiy subpage for a specific tab - function displayMenu($subpage=0) { + # we init b8 in this function and not directly in the event hook, because in the event hook the SPL autoload gets triggered by smarty and fails + function initB8() { global $serendipity; - - $css = file_get_contents(dirname(__FILE__). '/admin/serendipity_event_spamblock_bayes.css'); - #add javascript for usability - if ($serendipity['capabilities']['jquery']) { - $jquery_needed = false; - } else { - $jquery_needed = true; - } - - echo $this->smarty_show('bayesNavigation.tpl', array('css' => $css, - 'jquery_needed' => $jquery_needed, - 'path' => $this->path, - 'subpage' => $subpage, - 's9ybackend' => $GLOBALS['s9ybackend'] - )); - - switch($subpage) { - case '1': - $this->showRecyclerMenu($this->get['commentpage']); + if ($this->$b8 === null) { + require_once(dirname(__FILE__) . 
'/b8/b8.php'); + switch ($serendipity['dbType']) { + case 'mysql': + case 'mysqli': + $config_b8 = [ 'storage' => 'mysql' ]; break; - case '2': - $this->showDBMenu($this->get['commentpage']); + case 'sqlite': + case 'sqlite3': + case 'pdo-sqlite': + case 'pdo-sqliteoo': + $config_b8 = [ 'storage' => 'sqlite' ]; break; - case '3': - $this->showLearnMenu(); - break; - case '4': - $this->showAnalysisMenu($this->get['commentpage']); - break; - case '5': - $this->showImportMenu(); - break; - default: - break; - } - } - - /* Render a smarty-template - * $template: path to the template-file - * $data: map with the variables to assign - * */ - function smarty_show($template, $data = null) { - global $serendipity; - - if (!is_object($serendipity['smarty'])) { - serendipity_smarty_init(); - } - - $serendipity['smarty']->assign($data); - echo $this->parseTemplate($template); - } - - function showLearnMenu() { - echo $this->smarty_show('bayesLearnmenu.tpl', array('s9ybackend' => $GLOBALS['s9ybackend'])); - } - - function showDBMenu($commentpage) { - global $serendipity; - $data = array(); - - $sql = "SELECT - token, ham, spam, type - FROM - {$serendipity['dbPrefix']}spamblock_bayes ORDER BY spam" . serendipity_db_limit_sql(sprintf("%d,%d", $commentpage*20, 20)); - try { - $bayesTable = serendipity_db_query($sql, false, "assoc"); - } catch (Exception $e) { - $bayesTable = array(); - } - try { - $sql ="SELECT COUNT(token) FROM {$serendipity['dbPrefix']}spamblock_bayes"; - $amount = serendipity_db_query($sql, true, "num"); - $amount = $amount[0]; - } catch (Exception $e) { - $amount = 0; - } - - - $data['pages'] = ceil($amount / 20); - $data['bayesTable'] = $bayesTable; - if (! 
isset($commentpage)) { - $commentpage = 0; - } - $data['curpage'] = $commentpage; - - foreach($this->type as $type) { - $data[$type.'_ham'] = $this->get_config("{$type}_ham", 0); - $data[$type.'_spam'] = $this->get_config("{$type}_spam", 0); - } - - $data['path'] = $this->path; - $data['s9ybackend'] = $GLOBALS['s9ybackend']; - echo $this->smarty_show('bayesDBmenu.tpl', $data); - } - - function showRecyclerMenu($commentpage) { - $comments = $this->getAllRecyclerComments($commentpage); - if (is_array($comments[0])) { - for ($i=0; $i < count($comments); $i++) { - $comment = $comments[$i]; - - $types = array_keys($this->type); - $ratings = array(); - - $comment['rating'] = $this->startClassify($comment) * 100; - $comment['article_link'] = serendipity_archiveURL($comment['entry_id'], 'comments', 'serendipityHTTPPath', true); - $comment['article_title'] = $this->getEntryTitle($comment['entry_id']); - $comments[$i] = $comment; - } - } else { - $comments = array(); - } - echo $this->smarty_show('bayesRecyclermenu.tpl', array('comments' => $comments, - 'types' => array_values($this->type), - 'commentpage' => $commentpage, - 'path' => $this->path, - 's9ybackend' => $GLOBALS['s9ybackend'] - )); - } - - function getEntryTitle($id) { - global $serendipity; - $sql = "SELECT title FROM {$serendipity['dbPrefix']}entries WHERE id = '$id'"; - $title = serendipity_db_query($sql, true, "assoc"); - $title = $title['title']; - return $title; - } - - function showAnalysisMenu($commentpage=0) { - if (isset($this->get['comments'])) { - //comments already were selected - $comment_ids = array_keys($this->get['comments']); - $this->showAnalysis($comment_ids); - } else { - $comments = $this->getAllComments($commentpage); - if (!is_array($comments[0])) { - $comments = array(); - } - echo $this->smarty_show('bayesAnalysismenu.tpl', array( - 'comments' => $comments, - 'commentpage' => $commentpage, - 'path' => $this->path, - 's9ybackend' => $GLOBALS['s9ybackend'] - )); + + $config_storage = [ 
'resource' => $serendipity['dbConn'], + 'table' => 'b8_wordlist' ]; + $this->$b8 = new b8\b8($config_b8, $config_storage); } } - function showImportMenu() { - global $serendipity; - echo $this->smarty_show('bayesImportmenu.tpl', array( - 'trojaRegistered' => $this->get_config('troja_registered', false) == true, - 's9ybackend' => $GLOBALS['s9ybackend'] - )); + # Return the bayes rating reflecting the spamminess of the comment string. 0: ham, 1: spam + function rate($comment) { + $this->initB8(); + return $this->$b8->classify($comment); } - function showAnalysis($comment_id) { - $comments = $this->getComment($comment_id); - for ($i=0; $i < count($comments); $i++) { - $comment = $comments[$i]; - - $types = array_keys($this->type); - $ratings = array(); - - foreach($types as $type) { - $rating = $this->classify($comment[$this->type[$type]], $this->type[$type]); - - if (is_numeric($rating)) { - $ratings[$this->type[$type]] = $rating * 100; - } else { - $ratings[$this->type[$type]] = '-'; - } - } - $comment['rating'] = $this->startClassify($comment) * 100; - $comment['ratings'] = $ratings; - $comments[$i] = $comment; + # Mark a comment text as ham or spam + function learn($comment, $category) { + $this->initB8(); + if ($category == 'ham') { + $this->$b8->learn($comment, b8\b8::HAM); } - echo $this->smarty_show('bayesAnalysis.tpl', array('comments' => $comments, - 'types' => array_values($this->type), - 's9ybackend' => $GLOBALS['s9ybackend'] - )); - } - - #For email-notification. Learn a spam or ham and delete or approve. - function learnAction($id, $category, $action, $entry_id) { - - global $serendipity; - - $comment = $this->getComment($id); - if (is_array ($comment)) { - $comment = $comment['0']; + if ($category == 'spam') { + $this->$b8->learn($comment, b8\b8::SPAM); } - - $this->startLearn($comment, $category); - - # This generates a new Token if this function is called via e-mail comment token - # this should be a function in the core. 
It will be replaced if the core-modification is live. - if ($serendipity['useCommentTokens']) { - $token = md5(uniqid(rand(),1)); - $path = $path . "_token_" . $token; - //Delete any comment tokens older than 1 week. - serendipity_db_query("DELETE FROM {$serendipity['dbPrefix']}options - WHERE okey LIKE 'comment_%' AND name < " . (time() - 604800) ); - // Issue new comment moderation hash - serendipity_db_query("INSERT INTO {$serendipity['dbPrefix']}options (name, value, okey) - VALUES ('" . time() . "', '" . $token . "', 'comment_" . $id ."')"); - } - - - - - if ($action == 'delete') { - serendipity_deleteComment($id, $entry_id, 'comment', $token); - } else if ($action == 'approve') { - serendipity_approveComment($id, $entry_id, 'comment', $token); - } - } - - #id: array of ids or a single id - function getComment($id) { - global $serendipity; - - if(is_array($id)) { - $sql = "SELECT id, body, entry_id, author, email, url, ip, referer FROM {$serendipity['dbPrefix']}comments - WHERE " . serendipity_db_in_sql ( 'id', $id ); - } else { - $sql = "SELECT id, body, entry_id, author, email, url, ip, referer FROM {$serendipity['dbPrefix']}comments - WHERE id = " . (int)$id; - } - $comments = serendipity_db_query($sql, false, 'assoc'); - return $comments; - } - - #id: array of ids or a single id - function getRecyclerComment($id) { - global $serendipity; - - if(is_array($id)) { - $sql = "SELECT id, body, entry_id, author, email, url, ip, referer FROM {$serendipity['dbPrefix']}spamblock_bayes_recycler - WHERE " . serendipity_db_in_sql ( 'id', $id ); - } else { - $sql = "SELECT id, body, entry_id, author, email, url, ip, referer FROM {$serendipity['dbPrefix']}spamblock_bayes_recycler - WHERE id = " . 
(int)$id; - } - $comments = serendipity_db_query($sql, false, 'assoc'); - - return $comments; - } - - # Get all comments, or, when $page was given, give 20 comments of - # that page - function getAllComments($page=false) { - global $serendipity; - if ($page === false) { - $sql = "SELECT * FROM {$serendipity['dbPrefix']}comments ORDER BY id DESC"; - } else { - $first = $page * 20; - $amount = 21; - $sql = "SELECT * FROM {$serendipity['dbPrefix']}comments ORDER BY id DESC" . serendipity_db_limit_sql(sprintf("%d,%d", $first, $amount)); - } - $comments = serendipity_db_query($sql, false, 'assoc'); - - return $comments; - } - - function getAllRecyclerComments($page=false) { - global $serendipity; - if ($page === false) { - $sql = "SELECT * FROM {$serendipity['dbPrefix']}spamblock_bayes_recycler ORDER BY id DESC"; - } else { - $first = $page * 20; - $amount = 21; - $sql = "SELECT * FROM {$serendipity['dbPrefix']}spamblock_bayes_recycler ORDER BY id DESC" . serendipity_db_limit_sql(sprintf("%d,%d", $first, $amount)); - } - $comments = serendipity_db_query($sql, false, 'assoc'); - - return $comments; } function block(&$eventData, &$addData) { global $serendipity; if ($this->get_config('recycler', true)) { - $delete = $this->get_config('recyclerdelete', ''); - $rating = preg_replace('/\..*/', '', $this->lastRating * 100); - if (empty($delete) || $rating < $delete) { - $this->throwInRecycler($eventData, $addData); - } + $this->throwInRecycler($eventData, $addData); } - $logfile = $this->logfile = $this->get_config('logfile', $serendipity['serendipityPath'] . 
'spamblock.log'); - $this->log($logfile, $eventData['id'], 'REJECTED', PLUGIN_EVENT_SPAMBLOCK_BAYES_REASON, $addData); - $eventData = array ('allow_comments' => false); - $serendipity ['messagestack'] ['comments'] [] = PLUGIN_EVENT_SPAMBLOCK_BAYES_ERROR; + $eventData['allow_comments'] = false; + $serendipity['messagestack']['comments'][] = PLUGIN_EVENT_SPAMBLOCK_BAYES_ERROR; } function moderate(&$eventData, &$addData) { global $serendipity; - $logfile = $this->logfile = $this->get_config('logfile', $serendipity['serendipityPath'] . 'spamblock.log'); - $this->log($logfile, $eventData['id'], 'MODERATE', PLUGIN_EVENT_SPAMBLOCK_BAYES_REASON, $addData); $eventData['moderate_comments'] = true; $serendipity['csuccess'] = 'moderate'; $serendipity['moderate_reason'] = sprintf(PLUGIN_EVENT_SPAMBLOCK_BAYES_MODERATE); } + # id: id of a comment + function getComment($id) { + global $serendipity; + + $sql = "SELECT id, body, entry_id, author, email, url, ip, referer FROM {$serendipity['dbPrefix']}comments + WHERE id = " . (int)$id; + + $comments = serendipity_db_query($sql, false, 'assoc'); + return $comments; + } + + ### Recycler functionality ### + + function displayRecycler() { + global $serendipity; + $comments = $this->getAllRecyclerComments(); + if (is_array($comments[0])) { + for ($i=0; $i < count($comments); $i++) { + $databaseComment = $comments[$i]; + $comment = $databaseComment['url'] . ' ' . $databaseComment['body'] . ' ' . $databaseComment['author'] . ' ' . 
$databaseComment['email']; + + $databaseComment['article_link'] = serendipity_archiveURL($databaseComment['entry_id'], 'comments', 'serendipityHTTPPath', true); + $databaseComment['article_title'] = $this->getEntryTitle($databaseComment['entry_id']); + $comments[$i] = $databaseComment; + + } + } else { + $comments = array(); + } + if (!is_object($serendipity['smarty'])) { + serendipity_smarty_init(); + } + $serendipity['smarty']->assign('comments', $comments); + echo $this->parseTemplate('bayesRecyclermenu.tpl'); + } + + function getAllRecyclerComments() { + global $serendipity; + $sql = "SELECT * FROM {$serendipity['dbPrefix']}spamblock_bayes_recycler ORDER BY id DESC"; + $comments = serendipity_db_query($sql, false, 'assoc'); + + return $comments; + } + //Empty the Recycler function emptyRecycler() { global $serendipity; @@ -1833,301 +529,12 @@ class serendipity_event_spamblock_bayes extends serendipity_event { return serendipity_db_query($sql); } - /** - * Export the database spamblack_bayes into a csv-file - * */ - function exportDatabase() { + function getEntryTitle($id) { global $serendipity; - - #try to reduce memory usage by not selecting the whole table, - #but splitting it in chunks of 10000 - - $sql = "SELECT COUNT(*) - FROM - {$serendipity['dbPrefix']}spamblock_bayes"; - $amount = serendipity_db_query($sql); - $amount = $amount[0][0]; - - $runs = 0; - $csvfile = $serendipity ['serendipityPath'] . 'templates_c/spamblock_bayes.csv'; - $fp = @fopen($csvfile , 'w'); - while ($amount > ($start = $runs * 10000)) { - $sql = "SELECT - token, ham, spam, type - FROM - {$serendipity['dbPrefix']}spamblock_bayes - LIMIT $start, 10000"; - $database = serendipity_db_query($sql); - - #The array $database now contains all results twice. 
There's - #probably a nicer way to remove them - for ($i=0;$i < count($database); $i++) { - for ($j=0;$j < 4; $j++) { - unset($database[$i][$j]); - } - } - foreach ($database as $fields) { - fputcsv($fp, $fields); - } - $runs++; - } - fclose($fp); + $sql = "SELECT title FROM {$serendipity['dbPrefix']}entries WHERE id = '$id'"; + $title = serendipity_db_query($sql, true, "assoc"); + $title = $title['title']; + return $title; } - function fetchDatabase($host, $key) { - global $serendipity; - $data = array('key' => $key); - $url = $host . 'index.php?/plugin/bayesExportDatabase'; - if (function_exists('curl_init')) { - $ch = curl_init($url); - curl_setopt($ch, CURLOPT_POST, true); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); - curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($data)); - $result = curl_exec ($ch); - curl_close ($ch); - } else { - // this method should work, but in my test, this code - //never transmitted the post-fields properly - $options = array('http' => array( - 'method' => 'POST', - 'content' => http_build_query($data) - )); - $context = stream_context_create($options); - $result = file_get_contents($url, false, $context); - } - - if( $this->validCvs($result)) { - #write obtained csv to $file - $csvfile = $serendipity ['serendipityPath'] . 
'templates_c/spamblock_bayes.csv'; - file_put_contents($csvfile, $result); - $spamDB = $this->getCsvDatabase($csvfile); - - $this->importDatabase($spamDB); - } - } - - #check if the fetched page really was a spamblock-file - #param1: $content Content of the cvs - #return: true or false - function validCvs($content) { - $lines = explode("\n", $content); - $number_lines = count($lines) -1; - return preg_match_all("/.*,[0-9]*,[0-9]*,.*/", $content, $matches) == $number_lines; - } - - function importDatabase($importDatabase) { - global $serendipity; - set_time_limit(0); - serendipity_db_begin_transaction(); - if ($this->get_config('dbversion', 2) == 3 - && - ($serendipity['dbType'] == 'mysql' - || $serendipity['dbType'] == 'mysqli')) { - #now there is a primary key we can use - foreach ($importDatabase as $importToken) { - $token = $importToken[0]; - $ham = $importToken[1]; - $spam = $importToken[2]; - $type = $importToken[3]; - $sql = "INSERT INTO - {$serendipity['dbPrefix']}spamblock_bayes - (token, ham, spam, type) - VALUES - ('$token', $ham, $spam, '$type') - ON DUPLICATE KEY - UPDATE - ham = ham + VALUES(ham), - spam = spam + VALUES(spam);"; - - serendipity_db_query($sql); - $result = mysql_error(); - if ($result != "") { - serendipity_db_end_transaction(false); - return $result; - } - if ($ham > 0) { - $this->set_config("{$type}_ham", $this->get_config("{$type}_ham", 0) + 1); - } - if ($spam > 0) { - $this->set_config("{$type}_spam", $this->get_config("{$type}_spam", 0) + 1); - } - } - } elseif ($serendipity['dbType'] == 'sqlite' || $serendipity['dbType'] == 'sqlite3' || $serendipity['dbType'] == 'sqlite3oo' || $serendipity['dbType'] == 'pdo-sqlite') { - foreach ($importDatabase as $importToken) { - $token = $importToken[0]; - $ham = $importToken[1]; - $spam = $importToken[2]; - $type = $importToken[3]; - $sql = "INSERT OR IGNORE INTO - {$serendipity['dbPrefix']}spamblock_bayes - (token, ham, spam, type) - VALUES - ('$token', 0, 0, '$type');"; - 
serendipity_db_query($sql); - $sql = "UPDATE - {$serendipity['dbPrefix']}spamblock_bayes - SET - ham = ham + $ham, spam = spam + $spam - WHERE - token = '$token' AND type = '$type'"; - serendipity_db_query($sql); - if ($ham > 0) { - $this->set_config("{$type}_ham", $this->get_config("{$type}_ham", 0) + 1); - } - if ($spam > 0) { - $this->set_config("{$type}_spam", $this->get_config("{$type}_spam", 0) + 1); - } - } - } else { - foreach ($importDatabase as $importToken) { - $token = $importToken[0]; - $ham = $importToken[1]; - $spam = $importToken[2]; - $type = $importToken[3]; - $sql = "SELECT - token - FROM - {$serendipity['dbPrefix']}spamblock_bayes - WHERE - token = '$token' AND type = '$type'"; - - $tester = serendipity_db_query($sql); - - if (empty($tester[0])) { - $sql = "INSERT INTO - {$serendipity['dbPrefix']}spamblock_bayes - (token, ham, spam, type) - VALUES('$token', $ham, $spam, '$type')"; - } else { - $sql = "UPDATE {$serendipity['dbPrefix']}spamblock_bayes - SET - ham = ham + $ham, - spam = spam + $spam - WHERE token = '$token' AND type = '$type'"; - } - - serendipity_db_query($sql); - #NOTE: We do this wrongly, but as good as possible (really?). - # The config is supposed to store the amount of - # ham/spam-comments, not a guess of that. - if ($ham > 0) { - $this->set_config("{$type}_ham", $this->get_config("{$type}_ham", 0) + 1); - } - if ($spam > 0) { - $this->set_config("{$type}_spam", $this->get_config("{$type}_spam", 0) + 1); - } - } - } - serendipity_db_end_transaction(true); - - return true; - } - - function getCsvDatabase($csvfile) { - if (($handle = fopen($csvfile, "r")) !== FALSE) { - $i = 0; - while (($lineArray = fgetcsv($handle, 4000)) !== FALSE) { - for ($j=0; $jdebug_fp = @fopen ( $serendipity ['serendipityPath'] . 'templates_c/spamblock_bayes.log', 'a' ); - if (! 
$this->debug_fp) { - return false; - } - - if (empty ( $msg )) { - fwrite ( $this->debug_fp, "failure \n" ); - } else { - fwrite ( $this->debug_fp, print_r ( $msg, true ) ); - } - fclose ( $this->debug_fp ); - } - - function is_goodtoken($rpath, $cid) { - $tokenparse = explode("_",$rpath); - // check that we got a 32 char tokeni - if (is_array($tokenparse)) { - if (strlen($tokenparse[2]) == 32) { - $ret=serendipity_checkCommentToken($tokenparse[2], (int)$cid); - return $ret; - } else { - return false; - } - } else { - return false; - } - } - - function log($logfile, $id, $switch, $reason, $addData) { - global $serendipity; - $method = $this->get_config('logtype'); - - switch($method) { - case 'file': - - if (empty($logfile)) { - return; - } - if (strpos($logfile, '%') !== false) { - $logfile = strftime($logfile); - } - - $fp = @fopen($logfile, 'a+'); - if (!is_resource($fp)) { - return; - } - fwrite($fp, sprintf( - '[%s] - [%s: %s] - [#%s, Name "%s", E-Mail "%s", URL "%s", User-Agent "%s", IP %s] - [%s]' . 
"\n", - date('Y-m-d H:i:s', serendipity_serverOffsetHour()), - $switch, - $reason, - $id, - str_replace("\n", ' ', $addData['name']), - str_replace("\n", ' ', $addData['email']), - str_replace("\n", ' ', $addData['url']), - str_replace("\n", ' ', $_SERVER['HTTP_USER_AGENT']), - $_SERVER['REMOTE_ADDR'], - str_replace("\n", ' ', $addData['comment']) - )); - - fclose($fp); - break; - - case 'none': - return; - break; - - case 'db': - default: - $q = sprintf("INSERT INTO {$serendipity['dbPrefix']}spamblocklog - (timestamp, type, reason, entry_id, author, email, url, useragent, ip, referer, body) - VALUES (%d, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')", - - serendipity_serverOffsetHour(), - serendipity_db_escape_string($switch), - serendipity_db_escape_string($reason), - serendipity_db_escape_string($id), - serendipity_db_escape_string($addData['name']), - serendipity_db_escape_string($addData['email']), - serendipity_db_escape_string($addData['url']), - substr(serendipity_db_escape_string($_SERVER['HTTP_USER_AGENT']), 0, 255), - serendipity_db_escape_string($_SERVER['REMOTE_ADDR']), - substr(serendipity_db_escape_string(isset($_SESSION['HTTP_REFERER']) ? $_SESSION['HTTP_REFERER'] : $_SERVER['HTTP_REFERER']), 0, 255), - serendipity_db_escape_string($addData['comment']) - ); - - serendipity_db_schema_import($q); - break; - } - } }