diff options
author | Yaco <franco@reevo.org> | 2020-06-04 11:01:00 -0300 |
---|---|---|
committer | Yaco <franco@reevo.org> | 2020-06-04 11:01:00 -0300 |
commit | fc7369835258467bf97eb64f184b93691f9a9fd5 (patch) | |
tree | daabd60089d2dd76d9f5fb416b005fbe159c799d /www/wiki/extensions/SpamBlacklist/includes |
first commit
Diffstat (limited to 'www/wiki/extensions/SpamBlacklist/includes')
8 files changed, 1418 insertions, 0 deletions
diff --git a/www/wiki/extensions/SpamBlacklist/includes/ApiSpamBlacklist.php b/www/wiki/extensions/SpamBlacklist/includes/ApiSpamBlacklist.php new file mode 100644 index 00000000..5b91f2bc --- /dev/null +++ b/www/wiki/extensions/SpamBlacklist/includes/ApiSpamBlacklist.php @@ -0,0 +1,72 @@ +<?php +/** + * SpamBlacklist extension API + * + * Copyright © 2013 Wikimedia Foundation + * Based on code by Ian Baker, Victor Vasiliev, Bryan Tong Minh, Roan Kattouw, + * Alex Z., and Jackmcbarn + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + */ + +/** + * Query module check a URL against the blacklist + * + * @ingroup API + * @ingroup Extensions + */ +class ApiSpamBlacklist extends ApiBase { + + public function execute() { + $params = $this->extractRequestParams(); + $matches = BaseBlacklist::getInstance( 'spam' )->filter( $params['url'], null, true ); + $res = $this->getResult(); + + if ( $matches !== false ) { + // this url is blacklisted. 
+ $res->addValue( 'spamblacklist', 'result', 'blacklisted' ); + $res->setIndexedTagName( $matches, 'match' ); + $res->addValue( 'spamblacklist', 'matches', $matches ); + } else { + // not blacklisted + $res->addValue( 'spamblacklist', 'result', 'ok' ); + } + } + + public function getAllowedParams() { + return [ + 'url' => [ + ApiBase::PARAM_REQUIRED => true, + ApiBase::PARAM_ISMULTI => true, + ] + ]; + } + + /** + * @see ApiBase::getExamplesMessages() + * @return array + */ + protected function getExamplesMessages() { + return [ + 'action=spamblacklist&url=http://www.example.com/|http://www.example.org/' + => 'apihelp-spamblacklist-example-1', + ]; + } + + public function getHelpUrls() { + return [ 'https://www.mediawiki.org/wiki/Extension:SpamBlacklist/API' ]; + } +} diff --git a/www/wiki/extensions/SpamBlacklist/includes/BaseBlacklist.php b/www/wiki/extensions/SpamBlacklist/includes/BaseBlacklist.php new file mode 100644 index 00000000..add77e79 --- /dev/null +++ b/www/wiki/extensions/SpamBlacklist/includes/BaseBlacklist.php @@ -0,0 +1,448 @@ +<?php + +/** + * Base class for different kinds of blacklists + */ +abstract class BaseBlacklist { + /** + * Array of blacklist sources + * + * @var array + */ + public $files = []; + + /** + * Array containing regexes to test against + * + * @var bool|array + */ + protected $regexes = false; + + /** + * Chance of receiving a warning when the filter is hit + * + * @var int + */ + public $warningChance = 100; + + /** + * @var int + */ + public $warningTime = 600; + + /** + * @var int + */ + public $expiryTime = 900; + + /** + * Array containing blacklists that extend BaseBlacklist + * + * @var array + */ + private static $blacklistTypes = [ + 'spam' => 'SpamBlacklist', + 'email' => 'EmailBlacklist', + ]; + + /** + * Array of blacklist instances + * + * @var array + */ + private static $instances = []; + + /** + * Constructor + * + * @param array $settings + */ + function __construct( $settings = [] ) { + foreach ( $settings 
as $name => $value ) { + $this->$name = $value; + } + } + + /** + * @param array $links + * @param Title $title + * @param bool $preventLog + * @return mixed + */ + abstract public function filter( array $links, Title $title, $preventLog = false ); + + /** + * Adds a blacklist class to the registry + * + * @param string $type + * @param string $class + */ + public static function addBlacklistType( $type, $class ) { + self::$blacklistTypes[$type] = $class; + } + + /** + * Return the array of blacklist types currently defined + * + * @return array + */ + public static function getBlacklistTypes() { + return self::$blacklistTypes; + } + + /** + * @return SpamBlacklist + */ + public static function getSpamBlacklist() { + return self::getInstance( 'spam' ); + } + + /** + * @return EmailBlacklist + */ + public static function getEmailBlacklist() { + return self::getInstance( 'email' ); + } + + /** + * Returns an instance of the given blacklist + * + * @deprecated Use getSpamBlacklist() or getEmailBlacklist() instead + * @param string $type Code for the blacklist + * @return BaseBlacklist + * @throws Exception + */ + public static function getInstance( $type ) { + if ( !isset( self::$blacklistTypes[$type] ) ) { + throw new Exception( "Invalid blacklist type '$type' passed to " . __METHOD__ ); + } + + if ( !isset( self::$instances[$type] ) ) { + global $wgBlacklistSettings; + + // Prevent notices + if ( !isset( $wgBlacklistSettings[$type] ) ) { + $wgBlacklistSettings[$type] = []; + } + + $class = self::$blacklistTypes[$type]; + self::$instances[$type] = new $class( $wgBlacklistSettings[$type] ); + } + + return self::$instances[$type]; + } + + /** + * Returns the code for the blacklist implementation + * + * @return string + */ + abstract protected function getBlacklistType(); + + /** + * Check if the given local page title is a spam regex source. 
+ * + * @param Title $title + * @return bool + */ + public static function isLocalSource( Title $title ) { + global $wgDBname, $wgBlacklistSettings; + + if ( $title->getNamespace() == NS_MEDIAWIKI ) { + $sources = []; + foreach ( self::$blacklistTypes as $type => $class ) { + $type = ucfirst( $type ); + $sources += [ + "$type-blacklist", + "$type-whitelist" + ]; + } + + if ( in_array( $title->getDBkey(), $sources ) ) { + return true; + } + } + + $thisHttp = wfExpandUrl( $title->getFullUrl( 'action=raw' ), PROTO_HTTP ); + $thisHttpRegex = '/^' . preg_quote( $thisHttp, '/' ) . '(?:&.*)?$/'; + + $files = []; + foreach ( self::$blacklistTypes as $type => $class ) { + if ( isset( $wgBlacklistSettings[$type]['files'] ) ) { + $files += $wgBlacklistSettings[$type]['files']; + } + } + + foreach ( $files as $fileName ) { + $matches = []; + if ( preg_match( '/^DB: (\w*) (.*)$/', $fileName, $matches ) ) { + if ( $wgDBname == $matches[1] ) { + if ( $matches[2] == $title->getPrefixedDbKey() ) { + // Local DB fetch of this page... + return true; + } + } + } elseif ( preg_match( $thisHttpRegex, $fileName ) ) { + // Raw view of this page + return true; + } + } + + return false; + } + + /** + * Returns the type of blacklist from the given title + * + * @todo building a regex for this is pretty overkill + * @param Title $title + * @return bool|string + */ + public static function getTypeFromTitle( Title $title ) { + global $wgContLang; + + $types = array_map( [ $wgContLang, 'ucfirst' ], array_keys( self::$blacklistTypes ) ); + $regex = '/(' . implode( '|', $types ). ')-(?:blacklist|whitelist)/'; + + if ( preg_match( $regex, $title->getDBkey(), $m ) ) { + return strtolower( $m[1] ); + } + + return false; + } + + /** + * Fetch local and (possibly cached) remote blacklists. + * Will be cached locally across multiple invocations. + * @return array set of regular expressions, potentially empty. 
+ */ + function getBlacklists() { + if ( $this->regexes === false ) { + $this->regexes = array_merge( + $this->getLocalBlacklists(), + $this->getSharedBlacklists() ); + } + return $this->regexes; + } + + /** + * Returns the local blacklist + * + * @return array Regular expressions + */ + public function getLocalBlacklists() { + $that = $this; + $type = $this->getBlacklistType(); + + return ObjectCache::getMainWANInstance()->getWithSetCallback( + wfMemcKey( 'spamblacklist', $type, 'blacklist-regex' ), + $this->expiryTime, + function () use ( $that, $type ) { + return SpamRegexBatch::regexesFromMessage( "{$type}-blacklist", $that ); + } + ); + } + + /** + * Returns the (local) whitelist + * + * @return array Regular expressions + */ + public function getWhitelists() { + $that = $this; + $type = $this->getBlacklistType(); + + return ObjectCache::getMainWANInstance()->getWithSetCallback( + wfMemcKey( 'spamblacklist', $type, 'whitelist-regex' ), + $this->expiryTime, + function () use ( $that, $type ) { + return SpamRegexBatch::regexesFromMessage( "{$type}-whitelist", $that ); + } + ); + } + + /** + * Fetch (possibly cached) remote blacklists. + * @return array + */ + function getSharedBlacklists() { + $listType = $this->getBlacklistType(); + + wfDebugLog( 'SpamBlacklist', "Loading $listType regex..." ); + + if ( count( $this->files ) == 0 ) { + # No lists + wfDebugLog( 'SpamBlacklist', "no files specified\n" ); + return []; + } + + $miss = false; + + $that = $this; + $regexes = ObjectCache::getMainWANInstance()->getWithSetCallback( + // This used to be cached per-site, but that could be bad on a shared + // server where not all wikis have the same configuration. 
+ wfMemcKey( 'spamblacklist', $listType, 'shared-blacklist-regex' ), + $this->expiryTime, + function () use ( $that, &$miss ) { + $miss = true; + return $that->buildSharedBlacklists(); + } + ); + + if ( !$miss ) { + wfDebugLog( 'SpamBlacklist', "Got shared spam regexes from cache\n" ); + } + + return $regexes; + } + + /** + * Clear all primary blacklist cache keys + * + * @note: this method is unused atm + */ + function clearCache() { + $listType = $this->getBlacklistType(); + + $cache = ObjectCache::getMainWANInstance(); + $cache->delete( wfMemcKey( 'spamblacklist', $listType, 'shared-blacklist-regex' ) ); + $cache->delete( wfMemcKey( 'spamblacklist', $listType, 'blacklist-regex' ) ); + $cache->delete( wfMemcKey( 'spamblacklist', $listType, 'whitelist-regex' ) ); + + wfDebugLog( 'SpamBlacklist', "$listType blacklist local cache cleared.\n" ); + } + + function buildSharedBlacklists() { + $regexes = []; + $listType = $this->getBlacklistType(); + # Load lists + wfDebugLog( 'SpamBlacklist', "Constructing $listType blacklist\n" ); + foreach ( $this->files as $fileName ) { + $matches = []; + if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) { + $text = $this->getArticleText( $matches[1], $matches[2] ); + } elseif ( preg_match( '/^(https?:)?\/\//', $fileName ) ) { + $text = $this->getHttpText( $fileName ); + } else { + $text = file_get_contents( $fileName ); + wfDebugLog( 'SpamBlacklist', "got from file $fileName\n" ); + } + + // Build a separate batch of regexes from each source. + // While in theory we could squeeze a little efficiency + // out of combining multiple sources in one regex, if + // there's a bad line in one of them we'll gain more + // from only having to break that set into smaller pieces. 
+ $regexes = array_merge( $regexes, + SpamRegexBatch::regexesFromText( $text, $this, $fileName ) ); + } + + return $regexes; + } + + function getHttpText( $fileName ) { + global $wgDBname, $messageMemc; + $listType = $this->getBlacklistType(); + + # HTTP request + # To keep requests to a minimum, we save results into $messageMemc, which is + # similar to $wgMemc except almost certain to exist. By default, it is stored + # in the database + # There are two keys, when the warning key expires, a random thread will refresh + # the real key. This reduces the chance of multiple requests under high traffic + # conditions. + $key = "{$listType}_blacklist_file:$fileName"; + $warningKey = "$wgDBname:{$listType}filewarning:$fileName"; + $httpText = $messageMemc->get( $key ); + $warning = $messageMemc->get( $warningKey ); + + if ( !is_string( $httpText ) || ( !$warning && !mt_rand( 0, $this->warningChance ) ) ) { + wfDebugLog( 'SpamBlacklist', "Loading $listType blacklist from $fileName\n" ); + $httpText = Http::get( $fileName ); + if ( $httpText === false ) { + wfDebugLog( 'SpamBlacklist', "Error loading $listType blacklist from $fileName\n" ); + } + $messageMemc->set( $warningKey, 1, $this->warningTime ); + $messageMemc->set( $key, $httpText, $this->expiryTime ); + } else { + wfDebugLog( 'SpamBlacklist', "Got $listType blacklist from HTTP cache for $fileName\n" ); + } + return $httpText; + } + + /** + * Fetch an article from this or another local MediaWiki database. + * This is probably *very* fragile, and shouldn't be used perhaps. + * + * @param string $wiki + * @param string $article + * @return string + */ + function getArticleText( $wiki, $article ) { + wfDebugLog( 'SpamBlacklist', + "Fetching {$this->getBlacklistType()} blacklist from '$article' on '$wiki'...\n" ); + + $title = Title::newFromText( $article ); + // Load all the relevant tables from the correct DB. 
+ // This assumes that old_text is the actual text or + // that the external store system is at least unified. + if ( is_callable( [ Revision::class, 'getQueryInfo' ] ) ) { + $revQuery = Revision::getQueryInfo( [ 'page', 'text' ] ); + } else { + $revQuery = [ + 'tables' => [ 'revision', 'page', 'text' ], + 'fields' => array_merge( + Revision::selectFields(), + Revision::selectPageFields(), + Revision::selectTextFields() + ), + 'joins' => [ + 'text' => [ 'JOIN', 'old_id=rev_text_id' ] + ], + ]; + } + $row = wfGetDB( DB_REPLICA, [], $wiki )->selectRow( + $revQuery['tables'], + $revQuery['fields'], + [ + 'page_namespace' => $title->getNamespace(), // assume NS IDs match + 'page_title' => $title->getDBkey(), // assume same case rules + ], + __METHOD__, + [], + [ 'page' => [ 'JOIN', 'rev_id=page_latest' ] ] + $revQuery['joins'] + ); + + return $row + ? ContentHandler::getContentText( Revision::newFromRow( $row )->getContent() ) + : false; + } + + /** + * Returns the start of the regex for matches + * + * @return string + */ + public function getRegexStart() { + return '/[a-z0-9_\-.]*'; + } + + /** + * Returns the end of the regex for matches + * + * @param int $batchSize + * @return string + */ + public function getRegexEnd( $batchSize ) { + return ( $batchSize > 0 ) ? 
'/Sim' : '/im'; + } + + /** + * @param Title $title + * @param string[] $entries + */ + public function warmCachesForFilter( Title $title, array $entries ) { + // subclass this + } +} diff --git a/www/wiki/extensions/SpamBlacklist/includes/EmailBlacklist.php b/www/wiki/extensions/SpamBlacklist/includes/EmailBlacklist.php new file mode 100644 index 00000000..afcc8eb2 --- /dev/null +++ b/www/wiki/extensions/SpamBlacklist/includes/EmailBlacklist.php @@ -0,0 +1,67 @@ +<?php + +/** + * Email Blacklisting + */ +class EmailBlacklist extends BaseBlacklist { + /** + * @param array $links + * @param Title $title + * @param bool $preventLog + * @return mixed + */ + public function filter( array $links, Title $title, $preventLog = false ) { + throw new LogicException( __CLASS__ . ' cannot be used to filter links.' ); + } + + /** + * Returns the code for the blacklist implementation + * + * @return string + */ + protected function getBlacklistType() { + return 'email'; + } + + /** + * Checks a User object for a blacklisted email address + * + * @param User $user + * @return bool True on valid email + */ + public function checkUser( User $user ) { + $blacklists = $this->getBlacklists(); + $whitelists = $this->getWhitelists(); + + // The email to check + $email = $user->getEmail(); + + if ( !count( $blacklists ) ) { + // Nothing to check + return true; + } + + // Check for whitelisted email addresses + if ( is_array( $whitelists ) ) { + wfDebugLog( 'SpamBlacklist', "Excluding whitelisted email addresses from " . + count( $whitelists ) . " regexes: " . implode( ', ', $whitelists ) . "\n" ); + foreach ( $whitelists as $regex ) { + if ( preg_match( $regex, $email ) ) { + // Whitelisted email + return true; + } + } + } + + # Do the match + wfDebugLog( 'SpamBlacklist', "Checking e-mail address against " . count( $blacklists ) . + " regexes: " . implode( ', ', $blacklists ) . 
"\n" ); + foreach ( $blacklists as $regex ) { + if ( preg_match( $regex, $email ) ) { + return false; + } + } + + return true; + } +} diff --git a/www/wiki/extensions/SpamBlacklist/includes/SpamBlacklist.php b/www/wiki/extensions/SpamBlacklist/includes/SpamBlacklist.php new file mode 100644 index 00000000..a6122bc9 --- /dev/null +++ b/www/wiki/extensions/SpamBlacklist/includes/SpamBlacklist.php @@ -0,0 +1,348 @@ +<?php + +if ( !defined( 'MEDIAWIKI' ) ) { + exit; +} + +use \MediaWiki\MediaWikiServices; +use Wikimedia\Rdbms\Database; + +class SpamBlacklist extends BaseBlacklist { + const STASH_TTL = 180; + const STASH_AGE_DYING = 150; + + /** + * Changes to external links, for logging purposes + * @var array[] + */ + private $urlChangeLog = []; + + /** + * Returns the code for the blacklist implementation + * + * @return string + */ + protected function getBlacklistType() { + return 'spam'; + } + + /** + * Apply some basic anti-spoofing to the links before they get filtered, + * see @bug 12896 + * + * @param string $text + * + * @return string + */ + protected function antiSpoof( $text ) { + $text = str_replace( '.', '.', $text ); + return $text; + } + + /** + * @param string[] $links An array of links to check against the blacklist + * @param Title $title The title of the page to which the filter shall be applied. + * This is used to load the old links already on the page, so + * the filter is only applied to links that got added. If not given, + * the filter is applied to all $links. + * @param bool $preventLog Whether to prevent logging of hits. Set to true when + * the action is testing the links rather than attempting to save them + * (e.g. 
the API spamblacklist action) + * @param string $mode Either 'check' or 'stash' + * + * @return string[]|bool Matched text(s) if the edit should not be allowed; false otherwise + */ + function filter( array $links, Title $title = null, $preventLog = false, $mode = 'check' ) { + $statsd = MediaWikiServices::getInstance()->getStatsdDataFactory(); + $cache = ObjectCache::getLocalClusterInstance(); + + // If there are no new links, and we are logging, + // mark all of the current links as being removed. + if ( !$links && $this->isLoggingEnabled() ) { + $this->logUrlChanges( $this->getCurrentLinks( $title ), [], [] ); + } + + if ( !$links ) { + return false; + } + + sort( $links ); + $key = $cache->makeKey( + 'blacklist', + $this->getBlacklistType(), + 'pass', + sha1( implode( "\n", $links ) ), + (string)$title + ); + // Skip blacklist checks if nothing matched during edit stashing... + $knownNonMatchAsOf = $cache->get( $key ); + if ( $mode === 'check' ) { + if ( $knownNonMatchAsOf ) { + $statsd->increment( 'spamblacklist.check-stash.hit' ); + + return false; + } else { + $statsd->increment( 'spamblacklist.check-stash.miss' ); + } + } elseif ( $mode === 'stash' ) { + if ( $knownNonMatchAsOf && ( time() - $knownNonMatchAsOf ) < self::STASH_AGE_DYING ) { + return false; // OK; not about to expire soon + } + } + + $blacklists = $this->getBlacklists(); + $whitelists = $this->getWhitelists(); + + if ( count( $blacklists ) ) { + // poor man's anti-spoof, see bug 12896 + $newLinks = array_map( [ $this, 'antiSpoof' ], $links ); + + $oldLinks = []; + if ( $title !== null ) { + $oldLinks = $this->getCurrentLinks( $title ); + $addedLinks = array_diff( $newLinks, $oldLinks ); + } else { + // can't load old links, so treat all links as added. + $addedLinks = $newLinks; + } + + wfDebugLog( 'SpamBlacklist', "Old URLs: " . implode( ', ', $oldLinks ) ); + wfDebugLog( 'SpamBlacklist', "New URLs: " . implode( ', ', $newLinks ) ); + wfDebugLog( 'SpamBlacklist', "Added URLs: " . 
implode( ', ', $addedLinks ) ); + + if ( !$preventLog ) { + $this->logUrlChanges( $oldLinks, $newLinks, $addedLinks ); + } + + $links = implode( "\n", $addedLinks ); + + # Strip whitelisted URLs from the match + if ( is_array( $whitelists ) ) { + wfDebugLog( 'SpamBlacklist', "Excluding whitelisted URLs from " . count( $whitelists ) . + " regexes: " . implode( ', ', $whitelists ) . "\n" ); + foreach ( $whitelists as $regex ) { + wfSuppressWarnings(); + $newLinks = preg_replace( $regex, '', $links ); + wfRestoreWarnings(); + if ( is_string( $newLinks ) ) { + // If there wasn't a regex error, strip the matching URLs + $links = $newLinks; + } + } + } + + # Do the match + wfDebugLog( 'SpamBlacklist', "Checking text against " . count( $blacklists ) . + " regexes: " . implode( ', ', $blacklists ) . "\n" ); + $retVal = false; + foreach ( $blacklists as $regex ) { + wfSuppressWarnings(); + $matches = []; + $check = ( preg_match_all( $regex, $links, $matches ) > 0 ); + wfRestoreWarnings(); + if ( $check ) { + wfDebugLog( 'SpamBlacklist', "Match!\n" ); + global $wgRequest; + $ip = $wgRequest->getIP(); + $fullUrls = []; + $fullLineRegex = substr( $regex, 0, strrpos( $regex, '/' ) ) . 
'.*/Sim'; + preg_match_all( $fullLineRegex, $links, $fullUrls ); + $imploded = implode( ' ', $fullUrls[0] ); + wfDebugLog( 'SpamBlacklistHit', "$ip caught submitting spam: $imploded\n" ); + if ( !$preventLog ) { + $this->logFilterHit( $title, $imploded ); // Log it + } + if ( $retVal === false ) { + $retVal = []; + } + $retVal = array_merge( $retVal, $fullUrls[1] ); + } + } + if ( is_array( $retVal ) ) { + $retVal = array_unique( $retVal ); + } + } else { + $retVal = false; + } + + if ( $retVal === false ) { + // Cache the typical negative results + $cache->set( $key, time(), self::STASH_TTL ); + if ( $mode === 'stash' ) { + $statsd->increment( 'spamblacklist.check-stash.store' ); + } + } + + return $retVal; + } + + public function isLoggingEnabled() { + global $wgSpamBlacklistEventLogging; + return $wgSpamBlacklistEventLogging && class_exists( 'EventLogging' ); + } + + /** + * Diff added/removed urls and generate events for them + * + * @param string[] $oldLinks + * @param string[] $newLinks + * @param string[] $addedLinks + */ + public function logUrlChanges( $oldLinks, $newLinks, $addedLinks ) { + if ( !$this->isLoggingEnabled() ) { + return; + } + + $removedLinks = array_diff( $oldLinks, $newLinks ); + foreach ( $addedLinks as $url ) { + $this->logUrlChange( $url, 'insert' ); + } + + foreach ( $removedLinks as $url ) { + $this->logUrlChange( $url, 'remove' ); + } + } + + /** + * Actually push the url change events post-save + * + * @param User $user + * @param Title $title + * @param int $revId + */ + public function doLogging( User $user, Title $title, $revId ) { + if ( !$this->isLoggingEnabled() ) { + return; + } + + $baseInfo = [ + 'revId' => $revId, + 'pageId' => $title->getArticleID(), + 'pageNamespace' => $title->getNamespace(), + 'userId' => $user->getId(), + 'userText' => $user->getName(), + ]; + $changes = $this->urlChangeLog; + // Empty the changes queue in case this function gets called more than once + $this->urlChangeLog = []; + + 
DeferredUpdates::addCallableUpdate( function () use ( $changes, $baseInfo ) { + foreach ( $changes as $change ) { + EventLogging::logEvent( + 'ExternalLinksChange', + 15716074, + $baseInfo + $change + ); + } + } ); + } + + /** + * Queue log data about change for a url addition or removal + * + * @param string $url + * @param string $action 'insert' or 'remove' + */ + private function logUrlChange( $url, $action ) { + $parsed = wfParseUrl( $url ); + if ( !isset( $parsed['host'] ) ) { + wfDebugLog( 'SpamBlacklist', "Unable to parse $url" ); + return; + } + $info = [ + 'action' => $action, + 'protocol' => $parsed['scheme'], + 'domain' => $parsed['host'], + 'path' => isset( $parsed['path'] ) ? $parsed['path'] : '', + 'query' => isset( $parsed['query'] ) ? $parsed['query'] : '', + 'fragment' => isset( $parsed['fragment'] ) ? $parsed['fragment'] : '', + ]; + + $this->urlChangeLog[] = $info; + } + + /** + * Look up the links currently in the article, so we can + * ignore them on a second run. + * + * WARNING: I can add more *of the same link* with no problem here. 
+ * @param Title $title + * @return array + */ + function getCurrentLinks( Title $title ) { + $cache = ObjectCache::getMainWANInstance(); + return $cache->getWithSetCallback( + // Key is warmed via warmCachesForFilter() from ApiStashEdit + $cache->makeKey( 'external-link-list', $title->getLatestRevID() ), + $cache::TTL_MINUTE, + function ( $oldValue, &$ttl, array &$setOpts ) use ( $title ) { + $dbr = wfGetDB( DB_REPLICA ); + $setOpts += Database::getCacheSetOptions( $dbr ); + + return $dbr->selectFieldValues( + 'externallinks', + 'el_to', + [ 'el_from' => $title->getArticleID() ], // should be zero queries + __METHOD__ + ); + } + ); + } + + public function warmCachesForFilter( Title $title, array $entries ) { + $this->filter( $entries, $title, true /* no logging */, 'stash' ); + } + + /** + * Returns the start of the regex for matches + * + * @return string + */ + public function getRegexStart() { + return '/(?:https?:)?\/\/+[a-z0-9_\-.]*('; + } + + /** + * Returns the end of the regex for matches + * + * @param int $batchSize + * @return string + */ + public function getRegexEnd( $batchSize ) { + return ')' . parent::getRegexEnd( $batchSize ); + } + /** + * Logs the filter hit to Special:Log if + * $wgLogSpamBlacklistHits is enabled. 
+ * + * @param Title $title + * @param string $url URL that the user attempted to add + */ + public function logFilterHit( $title, $url ) { + global $wgUser, $wgLogSpamBlacklistHits; + if ( $wgLogSpamBlacklistHits ) { + $logEntry = new ManualLogEntry( 'spamblacklist', 'hit' ); + $logEntry->setPerformer( $wgUser ); + $logEntry->setTarget( $title ); + $logEntry->setParameters( [ + '4::url' => $url, + ] ); + $logid = $logEntry->insert(); + $log = new LogPage( 'spamblacklist' ); + if ( $log->isRestricted() ) { + // Make sure checkusers can see this action if the log is restricted + // (which is the default) + if ( ExtensionRegistry::getInstance()->isLoaded( 'CheckUser' ) + && class_exists( 'CheckUserHooks' ) + ) { + $rc = $logEntry->getRecentChange( $logid ); + CheckUserHooks::updateCheckUserData( $rc ); + } + } else { + // If the log is unrestricted, publish normally to RC, + // which will also update checkuser + $logEntry->publish( $logid, "rc" ); + } + } + } +} diff --git a/www/wiki/extensions/SpamBlacklist/includes/SpamBlacklistHooks.php b/www/wiki/extensions/SpamBlacklist/includes/SpamBlacklistHooks.php new file mode 100644 index 00000000..ca8c656f --- /dev/null +++ b/www/wiki/extensions/SpamBlacklist/includes/SpamBlacklistHooks.php @@ -0,0 +1,283 @@ +<?php + +/** + * Hooks for the spam blacklist extension + */ +class SpamBlacklistHooks { + + /** + * Hook function for EditFilterMergedContent + * + * @param IContextSource $context + * @param Content $content + * @param Status $status + * @param string $summary + * @param User $user + * @param bool $minoredit + * + * @return bool + */ + static function filterMergedContent( + IContextSource $context, + Content $content, + Status $status, + $summary, + User $user, + $minoredit + ) { + $title = $context->getTitle(); + + // get the link from the not-yet-saved page content. 
+ $editInfo = $context->getWikiPage()->prepareContentForEdit( $content ); + $pout = $editInfo->output; + $links = array_keys( $pout->getExternalLinks() ); + + // HACK: treat the edit summary as a link if it contains anything + // that looks like it could be a URL or e-mail address. + if ( preg_match( '/\S(\.[^\s\d]{2,}|[\/@]\S)/', $summary ) ) { + $links[] = $summary; + } + + $spamObj = BaseBlacklist::getSpamBlacklist(); + $matches = $spamObj->filter( $links, $title ); + + if ( $matches !== false ) { + $status->fatal( 'spamprotectiontext' ); + + foreach ( $matches as $match ) { + $status->fatal( 'spamprotectionmatch', $match ); + } + + $status->apiHookResult = [ + 'spamblacklist' => implode( '|', $matches ), + ]; + } + + // Always return true, EditPage will look at $status->isOk(). + return true; + } + + public static function onParserOutputStashForEdit( + WikiPage $page, + Content $content, + ParserOutput $output + ) { + $links = array_keys( $output->getExternalLinks() ); + $spamObj = BaseBlacklist::getSpamBlacklist(); + $spamObj->warmCachesForFilter( $page->getTitle(), $links ); + } + + /** + * Verify that the user can send emails + * + * @param User &$user + * @param array &$hookErr + * @return bool + */ + public static function userCanSendEmail( &$user, &$hookErr ) { + $blacklist = BaseBlacklist::getEmailBlacklist(); + if ( $blacklist->checkUser( $user ) ) { + return true; + } + + $hookErr = [ 'spam-blacklisted-email', 'spam-blacklisted-email-text', null ]; + + return false; + } + + /** + * Hook function for EditFilter + * Confirm that a local blacklist page being saved is valid, + * and toss back a warning to the user if it isn't. 
+ * + * @param EditPage $editPage + * @param string $text + * @param string $section + * @param string &$hookError + * @return bool + */ + static function validate( EditPage $editPage, $text, $section, &$hookError ) { + $title = $editPage->getTitle(); + $thisPageName = $title->getPrefixedDBkey(); + + if ( !BaseBlacklist::isLocalSource( $title ) ) { + wfDebugLog( 'SpamBlacklist', + "Spam blacklist validator: [[$thisPageName]] not a local blacklist\n" + ); + return true; + } + + $type = BaseBlacklist::getTypeFromTitle( $title ); + if ( $type === false ) { + return true; + } + + $lines = explode( "\n", $text ); + + $badLines = SpamRegexBatch::getBadLines( $lines, BaseBlacklist::getInstance( $type ) ); + if ( $badLines ) { + wfDebugLog( 'SpamBlacklist', + "Spam blacklist validator: [[$thisPageName]] given invalid input lines: " . + implode( ', ', $badLines ) . "\n" + ); + + $badList = "*<code>" . + implode( "</code>\n*<code>", + array_map( 'wfEscapeWikiText', $badLines ) ) . + "</code>\n"; + $hookError = + "<div class='errorbox'>" . + wfMessage( 'spam-invalid-lines' )->numParams( $badLines )->text() . "<br />" . + $badList . + "</div>\n" . + "<br clear='all' />\n"; + } else { + wfDebugLog( 'SpamBlacklist', + "Spam blacklist validator: [[$thisPageName]] ok or empty blacklist\n" + ); + } + + return true; + } + + /** + * Hook function for PageContentSaveComplete + * Clear local spam blacklist caches on page save. 
// --- SpamBlacklistHooks: remaining hook handlers ---

	/**
	 * PageContentSaveComplete hook handler.
	 *
	 * Logs the saved revision against the spam blacklist and, when the saved
	 * title is a locally-sourced blacklist page, clears every blacklist cache.
	 *
	 * @param WikiPage $wikiPage
	 * @param User $user
	 * @param Content $content
	 * @param string $summary
	 * @param bool $isMinor
	 * @param bool $isWatch
	 * @param string $section
	 * @param int $flags
	 * @param Revision|null $revision
	 * @param Status $status
	 * @param int $baseRevId
	 * @return bool Always true
	 */
	static function pageSaveContent(
		WikiPage $wikiPage,
		User $user,
		Content $content,
		$summary,
		$isMinor,
		$isWatch,
		$section,
		$flags,
		$revision,
		Status $status,
		$baseRevId
	) {
		if ( $revision ) {
			BaseBlacklist::getSpamBlacklist()
				->doLogging( $user, $wikiPage->getTitle(), $revision->getId() );
		}

		if ( !BaseBlacklist::isLocalSource( $wikiPage->getTitle() ) ) {
			return true;
		}

		// There is no way to tell which list changed, so every
		// registered blacklist type has to be invalidated.
		foreach ( BaseBlacklist::getBlacklistTypes() as $type => $class ) {
			BaseBlacklist::getInstance( $type )->clearCache();
		}

		return true;
	}

	/**
	 * UploadVerifyUpload hook handler: rejects uploads whose description page
	 * text (or URL-looking comment) contains a blacklisted link.
	 *
	 * @param UploadBase $upload
	 * @param User $user
	 * @param array $props
	 * @param string $comment
	 * @param string $pageText
	 * @param array|ApiMessage &$error Receives an ApiMessage on a match
	 * @return bool Always true
	 */
	public static function onUploadVerifyUpload(
		UploadBase $upload,
		User $user,
		array $props,
		$comment,
		$pageText,
		&$error
	) {
		$title = $upload->getTitle();

		// Collect the external links from the not-yet-saved page content.
		$content = ContentHandler::makeContent( $pageText, $title );
		$parserOptions = $content->getContentHandler()->makeParserOptions( 'canonical' );
		$parserOutput = $content->getParserOutput( $title, null, $parserOptions );
		$links = array_keys( $parserOutput->getExternalLinks() );

		// HACK: if the comment contains anything that looks like a URL or
		// e-mail address, check the whole comment as if it were a link.
		if ( preg_match( '/\S(\.[^\s\d]{2,}|[\/@]\S)/', $comment ) ) {
			$links[] = $comment;
		}

		if ( !$links ) {
			return true;
		}

		$matches = BaseBlacklist::getSpamBlacklist()->filter( $links, $title );
		if ( $matches !== false ) {
			$apiData = [
				'spamblacklist' => [ 'matches' => $matches ],
				'message' => [
					'key' => 'spamprotectionmatch',
					'params' => $matches[0],
				],
			];
			$error = new ApiMessage(
				wfMessage( 'spamprotectiontext' ),
				'spamblacklist',
				$apiData
			);
		}

		return true;
	}

	/**
	 * ArticleDelete hook handler.
	 *
	 * @param WikiPage &$article
	 * @param User &$user
	 * @param string &$reason
	 * @param string &$error
	 */
	public static function onArticleDelete( WikiPage &$article, User &$user, &$reason, &$error ) {
		$spam = BaseBlacklist::getSpamBlacklist();
		if ( !$spam->isLoggingEnabled() ) {
			return;
		}

		// Record the removed links now; the commit happens in
		// ArticleDeleteComplete because the externallinks table
		// could already be cleared by the time that hook runs.
		$spam->logUrlChanges( $spam->getCurrentLinks( $article->getTitle() ), [], [] );
	}

	/**
	 * ArticleDeleteComplete hook handler: commits the URL-change log
	 * started in onArticleDelete().
	 *
	 * @param WikiPage &$page
	 * @param User &$user
	 * @param string $reason
	 * @param int $id
	 * @param Content|null $content
	 * @param LogEntry $logEntry
	 */
	public static function onArticleDeleteComplete(
		&$page,
		User &$user,
		$reason,
		$id,
		Content $content = null,
		LogEntry $logEntry
	) {
		$blacklist = BaseBlacklist::getSpamBlacklist();
		$blacklist->doLogging( $user, $page->getTitle(), $page->getLatest() );
	}
}
// --- includes/SpamBlacklistLogFormatter.php ---

class SpamBlacklistLogFormatter extends LogFormatter {

	/**
	 * HTML-escape the matched URL (parameter index 3) before it is emitted
	 * as a raw message parameter in the log line.
	 *
	 * @return array
	 */
	protected function getMessageParameters() {
		$params = parent::getMessageParameters();
		$params[3] = Message::rawParam( htmlspecialchars( $params[3] ) );
		return $params;
	}

}

// --- includes/SpamBlacklistPreAuthenticationProvider.php ---

use MediaWiki\Auth\AbstractPreAuthenticationProvider;

class SpamBlacklistPreAuthenticationProvider extends AbstractPreAuthenticationProvider {
	/**
	 * Reject account creation when the requested e-mail address is caught
	 * by the e-mail blacklist.
	 *
	 * @param User $user
	 * @param User $creator
	 * @param array $reqs
	 * @return StatusValue Good when allowed, fatal otherwise
	 */
	public function testForAccountCreation( $user, $creator, array $reqs ) {
		$emailBlacklist = BaseBlacklist::getEmailBlacklist();
		if ( $emailBlacklist->checkUser( $user ) ) {
			return StatusValue::newGood();
		}

		return StatusValue::newFatal( 'spam-blacklisted-email-signup' );
	}
}

// --- includes/SpamRegexBatch.php ---

/**
 * Utility class for building and validating blacklist regexes.
 */
class SpamRegexBatch {
	/**
	 * Build a set of regular expressions matching URLs with the list of
	 * regex fragments. Returns an empty list if the input list is empty.
	 *
	 * @param array $lines List of fragments which will match in URLs
	 * @param BaseBlacklist $blacklist Supplies the regex prefix/suffix
	 * @param int $batchSize Largest allowed batch regex;
	 *   if 0, will produce one regex per line
	 * @return array
	 */
	static function buildRegexes( $lines, BaseBlacklist $blacklist, $batchSize = 4096 ) {
		$regexStart = $blacklist->getRegexStart();
		$regexEnd = $blacklist->getRegexEnd( $batchSize );

		// Wrap a batch of alternatives in the blacklist's delimiters,
		// normalising escaping of '/' so the '/' regex delimiter stays valid.
		$wrap = function ( $batch ) use ( $regexStart, $regexEnd ) {
			return $regexStart .
				str_replace( '/', '\/', preg_replace( '|\\\*/|u', '/', $batch ) ) .
				$regexEnd;
		};

		$regexes = [];
		$pending = false;
		foreach ( $lines as $line ) {
			if ( substr( $line, -1, 1 ) == "\\" ) {
				// A final backslash breaks silently on the batched regexes.
				// Skip it to avoid corrupting the next line; getBadLines()
				// still reports it on edit.
				continue;
			}
			if ( $pending === false ) {
				$pending = $line;
			} elseif ( strlen( $pending ) + strlen( $line ) > $batchSize ) {
				// FIXME: not a very robust size check, but it works.
				$regexes[] = $wrap( $pending );
				$pending = $line;
			} else {
				$pending .= '|' . $line;
			}
		}
		if ( $pending !== false ) {
			$regexes[] = $wrap( $pending );
		}

		return $regexes;
	}

	/**
	 * Confirm that a set of regexes is either empty or valid.
	 *
	 * @param array $regexes Set of regexes
	 * @return bool True if ok, false if any regex fails to compile
	 */
	static function validateRegexes( $regexes ) {
		foreach ( $regexes as $regex ) {
			wfSuppressWarnings();
			$compiles = preg_match( $regex, '' );
			wfRestoreWarnings();

			if ( $compiles === false ) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Strip comments and surrounding whitespace, then drop blank lines.
	 *
	 * @param array $lines
	 * @return array
	 */
	static function stripLines( $lines ) {
		$uncommented = preg_replace( '/#.*$/', '', $lines );
		return array_filter( array_map( 'trim', $uncommented ) );
	}
// --- SpamRegexBatch (continued) ---

	/**
	 * Sanity-check the batched regexes, rebuilding line-by-line on failure.
	 *
	 * @param array $lines Unsanitized input lines
	 * @param BaseBlacklist $blacklist
	 * @param bool|string $fileName Optional, for debug reporting
	 * @return array of regexes
	 */
	static function buildSafeRegexes( $lines, BaseBlacklist $blacklist, $fileName = false ) {
		$stripped = self::stripLines( $lines );
		$batched = self::buildRegexes( $stripped, $blacklist );
		if ( self::validateRegexes( $batched ) ) {
			return $batched;
		}

		// Something broke: rebuild one regex per line. Slower for large
		// blacklists, but a single bogus line no longer takes out
		// hundreds of its brothers.
		if ( $fileName ) {
			wfDebugLog( 'SpamBlacklist', "Spam blacklist warning: bogus line in $fileName\n" );
		}
		return self::buildRegexes( $stripped, $blacklist, 0 );
	}

	/**
	 * Returns an array of invalid lines
	 *
	 * @param array $lines
	 * @param BaseBlacklist $blacklist
	 * @return array Input lines which produce invalid regexes,
	 *   or empty array if no problems
	 */
	static function getBadLines( $lines, BaseBlacklist $blacklist ) {
		$lines = self::stripLines( $lines );

		// A trailing backslash breaks silently on the batched regexes,
		// so flag those lines up front.
		$badLines = [];
		foreach ( $lines as $line ) {
			if ( substr( $line, -1, 1 ) == "\\" ) {
				$badLines[] = $line;
			}
		}

		if ( self::validateRegexes( self::buildRegexes( $lines, $blacklist ) ) ) {
			// The whole batch compiled; nothing else is wrong.
			return $badLines;
		}

		// The batch failed, so test each line individually.
		foreach ( $lines as $line ) {
			if ( !self::validateRegexes( self::buildRegexes( [ $line ], $blacklist ) ) ) {
				$badLines[] = $line;
			}
		}
		return $badLines;
	}

	/**
	 * Build a set of regular expressions from the given multiline input
	 * text, with empty lines and comments stripped.
	 *
	 * @param string $source
	 * @param BaseBlacklist $blacklist
	 * @param bool|string $fileName Optional, for reporting of bad files
	 * @return array of regular expressions, potentially empty
	 */
	static function regexesFromText( $source, BaseBlacklist $blacklist, $fileName = false ) {
		return self::buildSafeRegexes( explode( "\n", $source ), $blacklist, $fileName );
	}

	/**
	 * Build a set of regular expressions from a MediaWiki message.
	 * Correctly empty when the message is disabled or absent.
	 *
	 * @param string $message
	 * @param BaseBlacklist $blacklist
	 * @return array of regular expressions, potentially empty
	 */
	static function regexesFromMessage( $message, BaseBlacklist $blacklist ) {
		$source = wfMessage( $message )->inContentLanguage();
		return $source->isDisabled() ? [] : self::regexesFromText( $source->plain(), $blacklist );
	}
}