additional_plugins/serendipity_event_spamblock_bayes/b8/b8.php
2021-07-18 13:23:44 +02:00

393 lines
14 KiB
PHP

<?php
/* SPDX-FileCopyrightText: 2006-2021 Tobias Leupold <tobias.leupold@gmx.de>
SPDX-FileCopyrightText: 2009 Oliver Lillie <ollie@buggedcom.co.uk>
SPDX-License-Identifier: LGPL-3.0-or-later
*/
/**
* The b8 spam filter library
*
* @package b8
*/
namespace b8;
spl_autoload_register(
    /**
     * Autoloader for b8's component classes.
     *
     * A class name of the form b8\<directory>\<name> is mapped to the file
     * <this directory>/<directory>/<name>.php. Class names that do not start with the "b8"
     * namespace or that have fewer than three parts are ignored.
     *
     * The file is only included if it actually exists: requiring a missing file would abort the
     * whole request with a fatal error and would keep other registered autoloaders (or a plain
     * class_exists() check) from handling the unknown class gracefully.
     *
     * @param string The fully qualified name of the class to load
     * @return void
     */
    static function ($class) {
        $parts = explode('\\', $class);
        if (count($parts) > 2 && $parts[0] === 'b8') {
            $file = __DIR__ . DIRECTORY_SEPARATOR . $parts[1]
                  . DIRECTORY_SEPARATOR . $parts[2] . '.php';
            if (is_file($file)) {
                require_once $file;
            }
        }
    }
);
class b8
{
    const DBVERSION = 3;

    // Categories a text can be learned as or unlearned from
    const SPAM = 'spam';
    const HAM  = 'ham';

    // Actions passed on to the storage backend
    const LEARN   = 'learn';
    const UNLEARN = 'unlearn';

    // Error codes returned by classify(), learn() and unlearn()
    const CLASSIFIER_TEXT_MISSING  = 'CLASSIFIER_TEXT_MISSING';
    const TRAINER_TEXT_MISSING     = 'TRAINER_TEXT_MISSING';
    const TRAINER_CATEGORY_MISSING = 'TRAINER_CATEGORY_MISSING';
    const TRAINER_CATEGORY_FAIL    = 'TRAINER_CATEGORY_FAIL';

    const INTERNALS_TEXTS     = 'b8*texts';
    const INTERNALS_DBVERSION = 'b8*dbversion';

    // Array keys used in the token data and the "internals" array
    const KEY_DB_VERSION = 'dbversion';
    const KEY_COUNT_HAM  = 'count_ham';
    const KEY_COUNT_SPAM = 'count_spam';
    const KEY_TEXTS_HAM  = 'texts_ham';
    const KEY_TEXTS_SPAM = 'texts_spam';

    // Default configuration; each key can be overridden via the constructor
    private $config = [ 'lexer'        => 'standard',
                        'degenerator'  => 'standard',
                        'storage'      => 'dba',
                        'use_relevant' => 15,
                        'min_dev'      => 0.2,
                        'rob_s'        => 0.3,
                        'rob_x'        => 0.5 ];

    private $storage     = null;
    private $lexer       = null;
    private $degenerator = null;
    private $token_data  = null;

    /**
     * Constructs b8
     *
     * Merges the passed configuration into the defaults (casting each value to its expected
     * type) and instantiates the configured degenerator, lexer and storage classes.
     *
     * @access public
     * @param array b8's configuration: [ 'lexer'        => string,
     *                                    'degenerator'  => string,
     *                                    'storage'      => string,
     *                                    'use_relevant' => int,
     *                                    'min_dev'      => float,
     *                                    'rob_s'        => float,
     *                                    'rob_x'        => float ]
     * @param array The storage backend's config (depending on the backend used)
     * @param array The lexer's config (depending on the lexer used)
     * @param array The degenerator's config (depending on the degenerator used)
     * @return void
     * @throws \Exception If $config contains an unknown key
     */
    public function __construct(array $config = [],
                                array $config_storage = [],
                                array $config_lexer = [],
                                array $config_degenerator = [])
    {
        // Validate config data
        foreach ($config as $name => $value) {
            switch ($name) {
                case 'min_dev':
                case 'rob_s':
                case 'rob_x':
                    $this->config[$name] = (float) $value;
                    break;
                case 'use_relevant':
                    $this->config[$name] = (int) $value;
                    break;
                case 'lexer':
                case 'degenerator':
                case 'storage':
                    $this->config[$name] = (string) $value;
                    break;
                default:
                    throw new \Exception(self::class . ": Unknown configuration key: \"$name\"");
            }
        }

        // Setup the degenerator class
        $class = '\\b8\\degenerator\\' . $this->config['degenerator'];
        $this->degenerator = new $class($config_degenerator);

        // Setup the lexer class
        $class = '\\b8\\lexer\\' . $this->config['lexer'];
        $this->lexer = new $class($config_lexer);

        // Setup the storage backend (it also gets the degenerator instance)
        $class = '\\b8\\storage\\' . $this->config['storage'];
        $this->storage = new $class($config_storage, $this->degenerator);
    }

    /**
     * Classifies a text
     *
     * @access public
     * @param string The text to classify
     * @return mixed float The rating between 0 (ham) and 1 (spam) or an error code
     */
    public function classify(?string $text = null)
    {
        // Let's first see if the user called the function correctly
        if ($text === null) {
            return self::CLASSIFIER_TEXT_MISSING;
        }

        // Get the internal database variables, containing the number of ham and spam texts so the
        // spam probability can be calculated in relation to them
        $internals = $this->storage->get_internals();

        // Get all tokens we want to rate
        $tokens = $this->lexer->get_tokens($text);

        // Check if the lexer failed (if so, $tokens will be a lexer error code, if not, $tokens
        // will be an array)
        if (! is_array($tokens)) {
            return $tokens;
        }

        // Fetch all available data for the token set from the database
        $this->token_data = $this->storage->get(array_keys($tokens));

        // Calculate the spaminess and importance for each token (or a degenerated form of it).
        // PHP turns numeric string keys into integers, so the token is cast back to a string.
        $rating = [];
        $importance = [];
        foreach ($tokens as $word => $count) {
            $rating[$word] = $this->get_probability((string) $word, $internals);
            $importance[$word] = abs(0.5 - $rating[$word]);
        }

        // Order by importance (most important first)
        arsort($importance);

        // Get the most interesting tokens (use all if we have less than the given number).
        // The keys are sliced instead of being walked with key()/next() and a truthiness check:
        // the token "0" is a perfectly valid key but evaluates to false, which would end the
        // selection prematurely and drop that token as well as all less important ones.
        $limit = max(0, $this->config['use_relevant']);
        $relevant = [];
        foreach (array_slice(array_keys($importance), 0, $limit) as $token) {
            // If the token's rating is relevant enough, use it
            if ($importance[$token] > $this->config['min_dev']) {
                // Tokens that appear more than once also count more than once
                for ($i = 0; $i < $tokens[$token]; $i++) {
                    $relevant[] = $rating[$token];
                }
            }
        }

        // If no token was good for calculation, we really don't know how to rate this text, so
        // we can return 0.5 without further calculations.
        if ($relevant === []) {
            return 0.5;
        }

        // Calculate the spaminess of the text (thanks to Mr. Robinson ;-)
        // We set both haminess and spaminess to 1 for the first multiplying
        $haminess = 1;
        $spaminess = 1;

        // Consider all relevant ratings
        foreach ($relevant as $value) {
            $haminess *= (1.0 - $value);
            $spaminess *= $value;
        }

        // Calculate the combined rating

        // Get the number of relevant ratings
        $n = count($relevant);

        // The actual haminess and spaminess
        $haminess = 1 - pow($haminess, (1 / $n));
        $spaminess = 1 - pow($spaminess, (1 / $n));

        // Calculate the combined indicator
        $probability = ($haminess - $spaminess) / ($haminess + $spaminess);

        // We want a value between 0 and 1, not between -1 and +1, so ...
        $probability = (1 + $probability) / 2;

        // Alea iacta est
        return $probability;
    }

    /**
     * Calculate the spaminess of a single token also considering "degenerated" versions
     *
     * Reads the token data fetched by classify() from $this->token_data.
     *
     * @access private
     * @param string The word to rate
     * @param array The "internals" array
     * @return float The word's rating
     */
    private function get_probability(string $word, array $internals)
    {
        // Let's see what we have!
        if (isset($this->token_data['tokens'][$word])) {
            // The token is in the database, so we can use its data as-is and calculate the
            // spaminess of this token directly
            return $this->calculate_probability($this->token_data['tokens'][$word], $internals);
        }

        // The token was not found, so do we at least have similar words?
        if (isset($this->token_data['degenerates'][$word])) {
            // We found similar words, so calculate the spaminess for each one and choose the most
            // important one for the further calculation

            // The default rating is 0.5 simply saying nothing
            $rating = 0.5;
            foreach ($this->token_data['degenerates'][$word] as $data) {
                // Calculate the rating of the current degenerated token
                $rating_tmp = $this->calculate_probability($data, $internals);
                // Is it more important than the rating of another degenerated version?
                if (abs(0.5 - $rating_tmp) > abs(0.5 - $rating)) {
                    $rating = $rating_tmp;
                }
            }

            return $rating;
        }

        // The token is really unknown, so choose the default rating for completely unknown
        // tokens. This strips down to the robX parameter so we can cheap out the freaky math
        // ;-)
        return $this->config['rob_x'];
    }

    /**
     * Do the actual spaminess calculation of a single token
     *
     * @access private
     * @param array The token's data [ \b8\b8::KEY_COUNT_HAM  => int,
     *                                 \b8\b8::KEY_COUNT_SPAM => int ]
     * @param array The "internals" array
     * @return float The rating
     */
    private function calculate_probability(array $data, array $internals)
    {
        // Calculate the basic probability as proposed by Mr. Graham

        // But: consider the number of ham and spam texts saved instead of the number of entries
        // where the token appeared to calculate a relative spaminess because we count tokens
        // appearing multiple times not just once but as often as they appear in the learned texts.
        $rel_ham = $data[self::KEY_COUNT_HAM];
        $rel_spam = $data[self::KEY_COUNT_SPAM];

        if ($internals[self::KEY_TEXTS_HAM] > 0) {
            $rel_ham = $data[self::KEY_COUNT_HAM] / $internals[self::KEY_TEXTS_HAM];
        }

        if ($internals[self::KEY_TEXTS_SPAM] > 0) {
            $rel_spam = $data[self::KEY_COUNT_SPAM] / $internals[self::KEY_TEXTS_SPAM];
        }

        // A token stored with both counts at zero carries no information at all. Dividing by the
        // zero sum would raise a DivisionByZeroError on PHP 8, so rate it like an unknown token.
        // This is also what Robinson's formula below yields for zero occurrences.
        // (Loose comparison on purpose: the sum may be an int or a float.)
        $rel_total = $rel_ham + $rel_spam;
        if ($rel_total == 0) {
            return $this->config['rob_x'];
        }

        $rating = $rel_spam / $rel_total;

        // Calculate the better probability proposed by Mr. Robinson
        $all = $data[self::KEY_COUNT_HAM] + $data[self::KEY_COUNT_SPAM];

        return (($this->config['rob_s'] * $this->config['rob_x']) + ($all * $rating))
             / ($this->config['rob_s'] + $all);
    }

    /**
     * Check the validity of the category of a request
     *
     * @access private
     * @param string The category
     * @return bool true if the category is either b8::HAM or b8::SPAM, false otherwise
     */
    private function check_category(string $category)
    {
        return $category === self::HAM || $category === self::SPAM;
    }

    /**
     * Learn a reference text
     *
     * @access public
     * @param string The text to learn
     * @param string Either b8::SPAM or b8::HAM
     * @return mixed The storage backend's result or an error code
     */
    public function learn(?string $text = null, ?string $category = null)
    {
        // Let's first see if the user called the function correctly
        if ($text === null) {
            return self::TRAINER_TEXT_MISSING;
        }
        if ($category === null) {
            return self::TRAINER_CATEGORY_MISSING;
        }

        return $this->process_text($text, $category, self::LEARN);
    }

    /**
     * Unlearn a reference text
     *
     * @access public
     * @param string The text to unlearn
     * @param string Either b8::SPAM or b8::HAM
     * @return mixed The storage backend's result or an error code
     */
    public function unlearn(?string $text = null, ?string $category = null)
    {
        // Let's first see if the user called the function correctly
        if ($text === null) {
            return self::TRAINER_TEXT_MISSING;
        }
        if ($category === null) {
            return self::TRAINER_CATEGORY_MISSING;
        }

        return $this->process_text($text, $category, self::UNLEARN);
    }

    /**
     * Does the actual interaction with the storage backend for learning or unlearning texts
     *
     * @access private
     * @param string The text to process
     * @param string Either b8::SPAM or b8::HAM
     * @param string Either b8::LEARN or b8::UNLEARN
     * @return mixed The storage backend's result or an error code
     */
    private function process_text(string $text, string $category, string $action)
    {
        // Look if the request is okay
        if (! $this->check_category($category)) {
            return self::TRAINER_CATEGORY_FAIL;
        }

        // Get all tokens from $text
        $tokens = $this->lexer->get_tokens($text);

        // Check if the lexer failed (if so, $tokens will be a lexer error code, if not, $tokens
        // will be an array)
        if (! is_array($tokens)) {
            return $tokens;
        }

        // Pass the tokens and what to do with it to the storage backend
        return $this->storage->process_text($tokens, $category, $action);
    }
}