diff options
Diffstat (limited to 'AntiSpoof/maintenance/generateEquivset.php')
-rw-r--r-- | AntiSpoof/maintenance/generateEquivset.php | 177 |
1 files changed, 0 insertions, 177 deletions
<?php
# Reconstructed from the deletion diff of
# AntiSpoof/maintenance/generateEquivset.php (blob 687e8470).

use UtfNormal\Utils;

$IP = getenv( 'MW_INSTALL_PATH' );
if ( $IP === false ) {
	$IP = __DIR__ . '/../../..';
}
require_once "$IP/maintenance/Maintenance.php";

/**
 * Maintenance script that reads equivset.in from this directory and writes
 * three derived files next to it:
 *  - equivset.txt: one set of equivalent characters per line,
 *  - equivset.php: a PHP array mapping each character to its canonical form,
 *  - equivset.ser: the same array in serialize() form.
 *
 * Each non-comment input line must match:
 *   HEX CHAR => [HEX] CHAR [# comment]
 * where HEX is an upper-case hexadecimal code point that has to agree with
 * the literal character following it.
 */
class GenerateEquivset extends Maintenance {
	public function __construct() {
		parent::__construct();

		$this->requireExtension( 'AntiSpoof' );
	}

	/**
	 * Parse equivset.in, validate every entry and write the output files.
	 *
	 * Invalid entries are reported through output() and skipped; the final
	 * status passed to error() is 1 if any entry was rejected, 0 otherwise.
	 */
	public function execute() {
		$dir = __DIR__;

		$endl = "\n";

		$lines = file( "$dir/equivset.in" );
		if ( !$lines ) {
			$this->error( "Unable to open equivset.in\n", 1 );
		}

		$setsFile = fopen( "$dir/equivset.txt", 'w' );
		if ( !$setsFile ) {
			$this->error( "Unable to open equivset.txt for writing\n", 1 );
		}

		fwrite( $setsFile, <<<EOT
# This file is generated by generateEquivset.php
# It shows sets of equivalent characters, one set per line, with characters
# separated by whitespace. This file is not used by MediaWiki, rather it is
# intended as a human-readable version of equivset.php, for debugging and
# review purposes.

EOT
		);

		$outputFile = fopen( "$dir/equivset.php", 'w' );
		if ( !$outputFile ) {
			$this->error( "Unable to open equivset.php for writing\n", 1 );
		}

		# The opening tag is split so this source never contains a literal
		# PHP open tag inside a string.
		fwrite( $outputFile, "<?" . "php$endl" . <<<EOT
# This file is generated by generateEquivset.php
# It contains a map of characters, encoded in UTF-8, such that running strtr()
# on a string with this map will cause confusable characters to be reduced to
# a canonical representation. The same array is also available in serialized
# form, in equivset.ser.

EOT
		);

		$serializedFile = fopen( "$dir/equivset.ser", 'w' );
		if ( !$serializedFile ) {
			$this->error( "Unable to open equivset.ser for writing\n", 1 );
		}

		# \s matches \xa0 in non-unicode mode, which is not what we want
		# So we need to make our own whitespace class
		$sp = '[\ \t]';

		$lineNum = 0;
		# Map of character => canonical character (the name of its set)
		$setsByChar = [];
		# Map of canonical character => list of all members of that set
		$sets = [];
		$exitStatus = 0;

		foreach ( $lines as $line ) {
			++$lineNum;

			# Whether the line ends with a nul character
			# (must be checked before trim(), which strips NUL bytes)
			$mapToEmpty = ( strpos( $line, "\0" ) === strlen( $line ) - 2 );

			$line = trim( $line );

			# Filter comments
			if ( !$line || $line[0] === '#' ) {
				continue;
			}

			# Process line
			if ( !preg_match(
				"/^(?P<hexleft> [A-F0-9]+) $sp+ (?P<charleft> .+?) $sp+ => $sp+ " .
					"(?:(?P<hexright> [A-F0-9]+) $sp+|) (?P<charright> .+?) $sp* (?: \#.*|) $ /x",
				$line, $m
			)
			) {
				$this->output( "Error: invalid entry at line $lineNum: $line\n" );
				$exitStatus = 1;
				continue;
			}
			$error = false;

			# Strict comparison: with != two numeric strings such as "0" and
			# "00" would be compared as numbers and wrongly treated as equal.
			if ( Utils::codepointToUtf8( hexdec( $m['hexleft'] ) ) !== $m['charleft'] ) {
				$actual = Utils::utf8ToCodepoint( $m['charleft'] );
				if ( $actual === false ) {
					$this->output( "Bytes: " . strlen( $m['charleft'] ) . "\n" );
					$this->output( bin2hex( $line ) . "\n" );
					$hexForm = bin2hex( $m['charleft'] );
					$this->output( "Invalid UTF-8 character \"{$m['charleft']}\" ($hexForm) at " .
						"line $lineNum: $line\n" );
				} else {
					$this->output( "Error: left number ({$m['hexleft']}) does not match left " .
						"character ($actual) at line $lineNum: $line\n" );
				}
				$error = true;
			}
			# The right-hand code point is optional; only validate it when given
			if ( !empty( $m['hexright'] )
				&& Utils::codepointToUtf8( hexdec( $m['hexright'] ) ) !== $m['charright']
			) {
				$actual = Utils::utf8ToCodepoint( $m['charright'] );
				if ( $actual === false ) {
					$hexForm = bin2hex( $m['charright'] );
					# Report the right-hand character, matching the hex dump
					$this->output( "Invalid UTF-8 character \"{$m['charright']}\" ($hexForm) at " .
						"line $lineNum: $line\n" );
				} else {
					$this->output( "Error: right number ({$m['hexright']}) does not match right " .
						"character ($actual) at line $lineNum: $line\n" );
				}
				$error = true;
			}
			if ( $error ) {
				$exitStatus = 1;
				continue;
			}
			if ( $mapToEmpty || $m['charright'] === 'NUL' ) {
				$m['charright'] = '';
			}

			# Find the set for the right character, add a new one if necessary
			if ( isset( $setsByChar[$m['charright']] ) ) {
				$setName = $setsByChar[$m['charright']];
				$setsByChar[$m['charleft']] = $setsByChar[$m['charright']];
			} else {
				$setName = $m['charright'];
				$setsByChar[$m['charleft']] = $m['charright'];
			}

			if ( !isset( $sets[$setName] ) ) {
				$sets[$setName] = [ $setName ];
			}

			$sets[$setName][] = $m['charleft'];
		}

		# Sets output
		foreach ( $sets as $members ) {
			fwrite( $setsFile, implode( ' ', $members ) . $endl );
		}

		# Map output
		$output = var_export( $setsByChar, true );
		$output = str_replace( "\n", $endl, $output );
		fwrite( $outputFile, '$equivset = ' . "$output;$endl" );

		# Serialized file
		fwrite( $serializedFile, serialize( $setsByChar ) );

		fclose( $setsFile );
		fclose( $outputFile );
		fclose( $serializedFile );

		$text = 'Finished';
		if ( $exitStatus > 0 ) {
			$text .= ' with errors';
		}
		$this->error( $text, $exitStatus );
	}
}

$maintClass = "GenerateEquivset";
require_once DO_MAINTENANCE;