additional_plugins/serendipity_event_spamblock_bayes/b8/lexer/standard.php

267 lines
8.4 KiB
PHP

<?php
/* Copyright (C) 2006-2019 Tobias Leupold <tobias.leupold@gmx.de>
This file is part of the b8 package
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation in version 2.1 of the License.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software Foundation,
Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
*/
/**
* A helper class to disassemble a text to tokens
*
* @license LGPL 2.1
* @package b8
* @author Tobias Leupold <tobias.leupold@gmx.de>
* @author Oliver Lillie <ollie@buggedcom.co.uk> (original PHP 5 port)
*/
namespace b8\lexer;
class standard
{
const LEXER_TEXT_NOT_STRING = 'LEXER_TEXT_NOT_STRING';
const LEXER_TEXT_EMPTY = 'LEXER_TEXT_EMPTY';
const LEXER_NO_TOKENS = 'b8*no_tokens';
private $config = [ 'min_size' => 3,
'max_size' => 30,
'get_uris' => true,
'get_html' => true,
'get_bbcode' => false,
'allow_numbers' => false ];
private $tokens = null;
private $processed_text = null;
// The regular expressions we use to split the text to tokens
private $regexp = [ 'raw_split' => '/[\s,\.\/"\:;\|<>\-_\[\]{}\+=\)\(\*\&\^%]+/',
'ip' => '/([A-Za-z0-9\_\-\.]+)/',
'uris' => '/([A-Za-z0-9\_\-]*\.[A-Za-z0-9\_\-\.]+)/',
'html' => '/(<.+?>)/',
'bbcode' => '/(\[.+?\])/',
'tagname' => '/(.+?)\s/',
'numbers' => '/^[0-9]+$/' ];
/**
* Constructs the lexer.
*
* @access public
* @param array $config The configuration: [ 'min_size' => int,
* 'max_size' => int,
* 'get_uris' => bool,
* 'get_html' => bool,
* 'get_bbcode' => bool,
* 'allow_numbers' => bool ]
* @return void
*/
function __construct(array $config)
{
// Validate config data
foreach ($config as $name=>$value) {
switch ($name) {
case 'min_size':
case 'max_size':
$this->config[$name] = (int) $value;
break;
case 'allow_numbers':
case 'get_uris':
case 'get_html':
case 'get_bbcode':
$this->config[$name] = (bool) $value;
break;
default:
throw new \Exception(standard::class . ": Unknown configuration key: "
. "\"$name\"");
}
}
}
/**
* Splits a text to tokens.
*
* @access public
* @param string $text The text to disassemble
* @return mixed Returns a list of tokens or an error code
*/
public function get_tokens(string $text)
{
// Check if we actually have a string ...
if (is_string($text) === false) {
return self::LEXER_TEXT_NOT_STRING;
}
// ... and if it's empty
if (empty($text) === true) {
return self::LEXER_TEXT_EMPTY;
}
// Re-convert the text to the original characters coded in UTF-8, as they have been coded in
// html entities during the post process
$this->processed_text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
// Reset the token list
$this->tokens = array();
if ($this->config['get_uris'] === true) {
// Get URIs
$this->get_uris($this->processed_text);
}
if ($this->config['get_html'] === true) {
// Get HTML
$this->get_markup($this->processed_text, $this->regexp['html']);
}
if ($this->config['get_bbcode'] === true) {
// Get BBCode
$this->get_markup($this->processed_text, $this->regexp['bbcode']);
}
// We always want to do a raw split of the (remaining) text, so:
$this->raw_split($this->processed_text);
// Be sure not to return an empty array
if (count($this->tokens) == 0) {
$this->tokens[self::LEXER_NO_TOKENS] = 1;
}
// Return a list of all found tokens
return $this->tokens;
}
/**
* Validates a token.
*
* @access private
* @param string $token The token string
* @return bool Returns true if the token is valid, otherwise returns false.
*/
private function is_valid(string $token)
{
// Just to be sure that the token's name won't collide with b8's internal variables
if (substr($token, 0, 3) == 'b8*') {
return false;
}
// Validate the size of the token
$len = strlen($token);
if ($len < $this->config['min_size'] || $len > $this->config['max_size']) {
return false;
}
// We may want to exclude pure numbers
if ($this->config['allow_numbers'] === false
&& preg_match($this->regexp['numbers'], $token) > 0) {
return false;
}
// Token is okay
return true;
}
/**
* Checks the validity of a token and adds it to the token list if it's valid.
*
* @access private
* @param string $token
* @param string $word_to_remove Word to remove from the processed string
* @return void
*/
private function add_token(string $token, string $word_to_remove = null)
{
// Check the validity of the token
if (! $this->is_valid($token)) {
return;
}
// Add it to the list or increase it's counter
if (! isset($this->tokens[$token])) {
$this->tokens[$token] = 1;
} else {
$this->tokens[$token] += 1;
}
// If requested, remove the word or it's original version from the text
if ($word_to_remove !== null) {
$this->processed_text = str_replace($word_to_remove, '', $this->processed_text);
}
}
/**
* Gets URIs.
*
* @access private
* @param string $text
* @return void
*/
private function get_uris(string $text)
{
// Find URIs
preg_match_all($this->regexp['uris'], $text, $raw_tokens);
foreach ($raw_tokens[1] as $word) {
// Remove a possible trailing dot
$word = rtrim($word, '.');
// Try to add the found tokens to the list
$this->add_token($word, $word);
// Also process the parts of the found URIs
$this->raw_split($word);
}
}
/**
* Gets HTML or BBCode markup, depending on the regexp used.
*
* @access private
* @param string $text
* @param string $regexp
* @return void
*/
private function get_markup(string $text, string $regexp)
{
// Search for the markup
preg_match_all($regexp, $text, $raw_tokens);
foreach ($raw_tokens[1] as $word) {
$actual_word = $word;
// If the tag has parameters, just use the tag itself
if (strpos($word, ' ') !== false) {
preg_match($this->regexp['tagname'], $word, $match);
$actual_word = $match[1];
$word = "$actual_word..." . substr($word, -1);
}
// Try to add the found tokens to the list
$this->add_token($word, $actual_word);
}
}
/**
* Does a raw split.
*
* @access private
* @param string $text
* @return void
*/
private function raw_split(string $text)
{
foreach (preg_split($this->regexp['raw_split'], $text) as $word) {
// Check the word and add it to the token list if it's valid
$this->add_token($word);
}
}
}