392 lines
14 KiB
PHP
392 lines
14 KiB
PHP
<?php
|
|
|
|
/* SPDX-FileCopyrightText: 2006-2021 Tobias Leupold <tobias.leupold@gmx.de>
   SPDX-FileCopyrightText: 2009 Oliver Lillie <ollie@buggedcom.co.uk>

   SPDX-License-Identifier: LGPL-3.0-or-later
*/
|
|
|
|
/**
 * The b8 spam filter library
 *
 * @package b8
 */
|
|
|
|
namespace b8;
|
|
|
|
/**
 * Autoloader for b8's pluggable components.
 *
 * A class name of the form "b8\<directory>\<name>" (three or more namespace parts, the first
 * being "b8") is mapped to the file "<this directory>/<directory>/<name>.php". Any other class
 * name is ignored.
 *
 * The file is only included when it actually exists. A missing file therefore does not abort the
 * script with a fatal "failed opening required" error; the lookup simply falls through to the
 * next registered autoloader (or to PHP's regular "class not found" handling).
 */
spl_autoload_register(
    static function ($class) {
        $parts = explode('\\', $class);

        // Only classes below the "b8" namespace with at least one sub-namespace are handled here
        if (count($parts) > 2 && $parts[0] === 'b8') {
            $file = __DIR__ . DIRECTORY_SEPARATOR . $parts[1]
                  . DIRECTORY_SEPARATOR . $parts[2] . '.php';

            if (is_file($file)) {
                require_once $file;
            }
        }
    }
);
|
|
|
|
/**
 * The b8 spam filter.
 *
 * Ties together three pluggable components, all instantiated by name in the constructor:
 *   - a lexer        (\b8\lexer\<name>)        splitting a text into tokens,
 *   - a degenerator  (\b8\degenerator\<name>)  handed to the storage backend,
 *   - a storage      (\b8\storage\<name>)      holding the learned token statistics.
 *
 * classify() rates a text, learn() and unlearn() update the stored statistics.
 */
class b8
{
    public const DBVERSION = 3;

    // Text categories
    public const SPAM = 'spam';
    public const HAM  = 'ham';

    // Actions passed to the storage backend
    public const LEARN   = 'learn';
    public const UNLEARN = 'unlearn';

    // Error codes returned by classify()
    public const CLASSIFIER_TEXT_MISSING = 'CLASSIFIER_TEXT_MISSING';

    // Error codes returned by learn() and unlearn()
    public const TRAINER_TEXT_MISSING     = 'TRAINER_TEXT_MISSING';
    public const TRAINER_CATEGORY_MISSING = 'TRAINER_CATEGORY_MISSING';
    public const TRAINER_CATEGORY_FAIL    = 'TRAINER_CATEGORY_FAIL';

    public const INTERNALS_TEXTS     = 'b8*texts';
    public const INTERNALS_DBVERSION = 'b8*dbversion';

    // Array keys used in the "internals" and token data arrays
    public const KEY_DB_VERSION = 'dbversion';
    public const KEY_COUNT_HAM  = 'count_ham';
    public const KEY_COUNT_SPAM = 'count_spam';
    public const KEY_TEXTS_HAM  = 'texts_ham';
    public const KEY_TEXTS_SPAM = 'texts_spam';

    /** @var array The effective configuration (defaults, overridden by the constructor) */
    private $config = [
        'lexer'        => 'standard',
        'degenerator'  => 'standard',
        'storage'      => 'dba',
        'use_relevant' => 15,
        'min_dev'      => 0.2,
        'rob_s'        => 0.3,
        'rob_x'        => 0.5,
    ];

    /** @var object|null The storage backend instance */
    private $storage = null;

    /** @var object|null The lexer instance */
    private $lexer = null;

    /** @var object|null The degenerator instance */
    private $degenerator = null;

    /** @var array|null Token data fetched from the storage for the text being classified */
    private $token_data = null;

    /**
     * Constructs b8
     *
     * @param array $config b8's configuration: [ 'lexer'        => string,
     *                                            'degenerator'  => string,
     *                                            'storage'      => string,
     *                                            'use_relevant' => int,
     *                                            'min_dev'      => float,
     *                                            'rob_s'        => float,
     *                                            'rob_x'        => float ]
     *                      Each given value is cast to the listed type.
     * @param array $config_storage     The storage backend's config (depending on the backend used)
     * @param array $config_lexer       The lexer's config (depending on the lexer used)
     * @param array $config_degenerator The degenerator's config (depending on the degenerator used)
     * @throws \Exception If $config contains an unknown key
     */
    public function __construct(
        array $config = [],
        array $config_storage = [],
        array $config_lexer = [],
        array $config_degenerator = []
    ) {
        // Validate config data
        foreach ($config as $name => $value) {
            switch ($name) {
                case 'min_dev':
                case 'rob_s':
                case 'rob_x':
                    $this->config[$name] = (float) $value;
                    break;
                case 'use_relevant':
                    $this->config[$name] = (int) $value;
                    break;
                case 'lexer':
                case 'degenerator':
                case 'storage':
                    $this->config[$name] = (string) $value;
                    break;
                default:
                    throw new \Exception(self::class . ": Unknown configuration key: \"$name\"");
            }
        }

        // Setup the degenerator class
        $class = '\\b8\\degenerator\\' . $this->config['degenerator'];
        $this->degenerator = new $class($config_degenerator);

        // Setup the lexer class
        $class = '\\b8\\lexer\\' . $this->config['lexer'];
        $this->lexer = new $class($config_lexer);

        // Setup the storage backend (it also gets the degenerator)
        $class = '\\b8\\storage\\' . $this->config['storage'];
        $this->storage = new $class($config_storage, $this->degenerator);
    }

    /**
     * Classifies a text
     *
     * @param string|null $text The text to classify
     * @return mixed float The rating between 0 (ham) and 1 (spam), or an error code
     *               (b8::CLASSIFIER_TEXT_MISSING or whatever non-array value the lexer returned)
     */
    public function classify(?string $text = null)
    {
        // Let's first see if the user called the function correctly
        if ($text === null) {
            return self::CLASSIFIER_TEXT_MISSING;
        }

        // Get the internal database variables, containing the number of ham and spam texts so the
        // spam probability can be calculated in relation to them
        $internals = $this->storage->get_internals();

        // Get all tokens we want to rate (token => number of occurrences)
        $tokens = $this->lexer->get_tokens($text);

        // Check if the lexer failed (if so, $tokens will be a lexer error code, if not, $tokens
        // will be an array)
        if (! is_array($tokens)) {
            return $tokens;
        }

        // Fetch all available data for the token set from the database
        $this->token_data = $this->storage->get(array_keys($tokens));

        // Calculate the spaminess and importance for each token (or a degenerated form of it)
        $rating = [];
        $importance = [];
        foreach ($tokens as $word => $count) {
            // PHP turns numeric string keys like "42" into integers, so cast back to a string
            $rating[$word] = $this->get_probability((string) $word, $internals);
            $importance[$word] = abs(0.5 - $rating[$word]);
        }

        // Order by importance (descending, keys preserved)
        arsort($importance);

        // Combine the ratings of the most interesting tokens (thanks to Mr. Robinson ;-)
        // Both products start at 1 for the first multiplication.
        $ham_product = 1.0;
        $spam_product = 1.0;
        $n = 0;

        // Look at the "use_relevant" most important tokens at most (or all, if we have less).
        // Iterating with foreach instead of testing key() for truthiness makes sure that falsy
        // tokens like "0" don't end the loop prematurely.
        $remaining = $this->config['use_relevant'];
        foreach ($importance as $token => $deviation) {
            if ($remaining-- <= 0) {
                break;
            }

            // The list is sorted by descending deviation, so as soon as one token isn't relevant
            // enough, none of the following ones can be
            if (! ($deviation > $this->config['min_dev'])) {
                break;
            }

            // Tokens that appear more than once also count more than once
            for ($x = 0, $l = $tokens[$token]; $x < $l; $x++) {
                $ham_product *= (1.0 - $rating[$token]);
                $spam_product *= $rating[$token];
                $n++;
            }
        }

        // If no token was good for calculation, we really don't know how to rate this text, so
        // we can return 0.5 without further calculations.
        if ($n === 0) {
            return 0.5;
        }

        // The actual haminess and spaminess
        $haminess = 1 - pow($ham_product, (1 / $n));
        $spaminess = 1 - pow($spam_product, (1 / $n));

        // Guard against a division by zero in the combined indicator; without any indication in
        // either direction, the text is rated as "unknown"
        $total = $haminess + $spaminess;
        if ($total == 0) {
            return 0.5;
        }

        // Calculate the combined indicator (between -1 and +1) ...
        $probability = ($haminess - $spaminess) / $total;

        // ... and scale it to a value between 0 and 1. Alea iacta est
        return (1 + $probability) / 2;
    }

    /**
     * Calculate the spaminess of a single token also considering "degenerated" versions
     *
     * Reads the token data fetched by classify() ($this->token_data).
     *
     * @param string $word      The word to rate
     * @param array  $internals The "internals" array
     * @return float The word's rating
     */
    private function get_probability(string $word, array $internals): float
    {
        // The token is in the database, so we can use its data as-is and calculate the spaminess
        // of this token directly
        if (isset($this->token_data['tokens'][$word])) {
            return $this->calculate_probability($this->token_data['tokens'][$word], $internals);
        }

        // The token is really unknown (not even similar words are known), so choose the default
        // rating for completely unknown tokens, which is the rob_x parameter
        if (! isset($this->token_data['degenerates'][$word])) {
            return $this->config['rob_x'];
        }

        // We found similar words, so calculate the spaminess for each one and choose the most
        // important one (the one deviating most from 0.5) for the further calculation.
        // The default rating is 0.5, simply saying nothing.
        $rating = 0.5;
        foreach ($this->token_data['degenerates'][$word] as $data) {
            $candidate = $this->calculate_probability($data, $internals);
            if (abs(0.5 - $candidate) > abs(0.5 - $rating)) {
                $rating = $candidate;
            }
        }

        return $rating;
    }

    /**
     * Do the actual spaminess calculation of a single token
     *
     * @param array $data      The token's data [ b8::KEY_COUNT_HAM  => int,
     *                                            b8::KEY_COUNT_SPAM => int ]
     * @param array $internals The "internals" array, read at b8::KEY_TEXTS_HAM and
     *                         b8::KEY_TEXTS_SPAM
     * @return float The rating
     */
    private function calculate_probability(array $data, array $internals): float
    {
        $count_ham = $data[self::KEY_COUNT_HAM];
        $count_spam = $data[self::KEY_COUNT_SPAM];

        // Calculate the basic probability as proposed by Mr. Graham
        //
        // But: consider the number of ham and spam texts saved instead of the number of entries
        // where the token appeared to calculate a relative spaminess because we count tokens
        // appearing multiple times not just once but as often as they appear in the learned texts.
        $rel_ham = $count_ham;
        $rel_spam = $count_spam;

        if ($internals[self::KEY_TEXTS_HAM] > 0) {
            $rel_ham = $count_ham / $internals[self::KEY_TEXTS_HAM];
        }

        if ($internals[self::KEY_TEXTS_SPAM] > 0) {
            $rel_spam = $count_spam / $internals[self::KEY_TEXTS_SPAM];
        }

        // A token that was seen neither in ham nor in spam carries no information. Rate it like
        // an unknown token instead of dividing by zero (Robinson's formula below yields exactly
        // rob_x for a token with zero occurrences).
        $rel_total = $rel_ham + $rel_spam;
        if ($rel_total == 0) {
            return $this->config['rob_x'];
        }

        $rating = $rel_spam / $rel_total;

        // Calculate the better probability proposed by Mr. Robinson
        $all = $count_ham + $count_spam;

        return (($this->config['rob_s'] * $this->config['rob_x']) + ($all * $rating))
               / ($this->config['rob_s'] + $all);
    }

    /**
     * Check the validity of the category of a request
     *
     * @param string $category The category
     * @return bool true if the category is either b8::HAM or b8::SPAM, false otherwise
     */
    private function check_category(string $category): bool
    {
        return $category === self::HAM || $category === self::SPAM;
    }

    /**
     * Learn a reference text
     *
     * @param string|null $text     The text to learn
     * @param string|null $category Either b8::SPAM or b8::HAM
     * @return mixed The storage backend's result of processing the text, or an error code
     */
    public function learn(?string $text = null, ?string $category = null)
    {
        // Let's first see if the user called the function correctly
        if ($text === null) {
            return self::TRAINER_TEXT_MISSING;
        }
        if ($category === null) {
            return self::TRAINER_CATEGORY_MISSING;
        }

        return $this->process_text($text, $category, self::LEARN);
    }

    /**
     * Unlearn a reference text
     *
     * @param string|null $text     The text to unlearn
     * @param string|null $category Either b8::SPAM or b8::HAM
     * @return mixed The storage backend's result of processing the text, or an error code
     */
    public function unlearn(?string $text = null, ?string $category = null)
    {
        // Let's first see if the user called the function correctly
        if ($text === null) {
            return self::TRAINER_TEXT_MISSING;
        }
        if ($category === null) {
            return self::TRAINER_CATEGORY_MISSING;
        }

        return $this->process_text($text, $category, self::UNLEARN);
    }

    /**
     * Does the actual interaction with the storage backend for learning or unlearning texts
     *
     * @param string $text     The text to process
     * @param string $category Either b8::SPAM or b8::HAM
     * @param string $action   Either b8::LEARN or b8::UNLEARN
     * @return mixed The storage backend's result, b8::TRAINER_CATEGORY_FAIL for an invalid
     *               category or the lexer's error code if the text could not be tokenized
     */
    private function process_text(string $text, string $category, string $action)
    {
        // Look if the request is okay
        if (! $this->check_category($category)) {
            return self::TRAINER_CATEGORY_FAIL;
        }

        // Get all tokens from $text
        $tokens = $this->lexer->get_tokens($text);

        // Check if the lexer failed (if so, $tokens will be a lexer error code, if not, $tokens
        // will be an array)
        if (! is_array($tokens)) {
            return $tokens;
        }

        // Pass the tokens and what to do with it to the storage backend
        return $this->storage->process_text($tokens, $category, $action);
    }
}
|