summary refs log tree commit diff
diff options
context:
space:
mode:
Diffstat (limited to 'AntiSpoof/maintenance/generateEquivset.php')
-rw-r--r-- AntiSpoof/maintenance/generateEquivset.php 177
1 files changed, 0 insertions, 177 deletions
diff --git a/AntiSpoof/maintenance/generateEquivset.php b/AntiSpoof/maintenance/generateEquivset.php
deleted file mode 100644
index 687e8470..00000000
--- a/AntiSpoof/maintenance/generateEquivset.php
+++ /dev/null
@@ -1,177 +0,0 @@
-<?php
-
-use UtfNormal\Utils;
-
-$IP = getenv( 'MW_INSTALL_PATH' );
-if ( $IP === false ) {
- $IP = __DIR__ . '/../../..';
-}
-require_once "$IP/maintenance/Maintenance.php";
-
-class GenerateEquivset extends Maintenance {
- public function __construct() {
- parent::__construct();
-
- $this->requireExtension( 'AntiSpoof' );
- }
-
- public function execute() {
- $dir = __DIR__;
-
- $endl = "\n";
-
- $lines = file( "$dir/equivset.in" );
- if ( !$lines ) {
- $this->error( "Unable to open equivset.in\n", 1 );
- }
-
- $setsFile = fopen( "$dir/equivset.txt", 'w' );
- if ( !$setsFile ) {
- $this->error( "Unable to open equivset.txt for writing\n", 1 );
- }
-
- fwrite( $setsFile, <<<EOT
-# This file is generated by generateEquivset.php
-# It shows sets of equivalent characters, one set per line, with characters
-# separated by whitespace. This file is not used by MediaWiki, rather it is
-# intended as a human-readable version of equivset.php, for debugging and
-# review purposes.
-
-EOT
- );
-
- $outputFile = fopen( "$dir/equivset.php", 'w' );
- if ( !$outputFile ) {
- $this->error( "Unable to open equivset.php for writing\n", 1 );
- }
-
- fwrite( $outputFile, "<?" . "php$endl" . <<<EOT
-# This file is generated by generateEquivset.php
-# It contains a map of characters, encoded in UTF-8, such that running strtr()
-# on a string with this map will cause confusable characters to be reduced to
-# a canonical representation. The same array is also available in serialized
-# form, in equivset.ser.
-
-EOT
- );
-
- $serializedFile = fopen( "$dir/equivset.ser", 'w' );
- if ( !$serializedFile ) {
- $this->error( "Unable to open equivset.ser for writing\n", 1 );
- }
-
- # \s matches \xa0 in non-unicode mode, which is not what we want
- # So we need to make our own whitespace class
- $sp = '[\ \t]';
-
- $lineNum = 0;
- $setsByChar = [];
- $sets = array();
- $exitStatus = 0;
-
- foreach ( $lines as $line ) {
- ++$lineNum;
-
- # Whether the line ends with a nul character
- $mapToEmpty = ( strpos( $line, "\0" ) === strlen( $line ) - 2 );
-
- $line = trim( $line );
-
- # Filter comments
- if ( !$line || $line[0] == '#' ) {
- continue;
- }
-
- # Process line
- if ( !preg_match(
- "/^(?P<hexleft> [A-F0-9]+) $sp+ (?P<charleft> .+?) $sp+ => $sp+ " .
- "(?:(?P<hexright> [A-F0-9]+) $sp+|) (?P<charright> .+?) $sp* (?: \#.*|) $ /x",
- $line, $m
- )
- ) {
- $this->output( "Error: invalid entry at line $lineNum: $line\n" );
- $exitStatus = 1;
- continue;
- }
- $error = false;
-
- if ( Utils::codepointToUtf8( hexdec( $m['hexleft'] ) ) != $m['charleft'] ) {
- $actual = Utils::utf8ToCodepoint( $m['charleft'] );
- if ( $actual === false ) {
- $this->output( "Bytes: " . strlen( $m['charleft'] ) . "\n" );
- $this->output( bin2hex( $line ) . "\n" );
- $hexForm = bin2hex( $m['charleft'] );
- $this->output( "Invalid UTF-8 character \"{$m['charleft']}\" ($hexForm) at " .
- "line $lineNum: $line\n" );
- } else {
- $this->output( "Error: left number ({$m['hexleft']}) does not match left " .
- "character ($actual) at line $lineNum: $line\n" );
- }
- $error = true;
- }
- if ( !empty( $m['hexright'] )
- && Utils::codepointToUtf8( hexdec( $m['hexright'] ) ) != $m['charright']
- ) {
- $actual = Utils::utf8ToCodepoint( $m['charright'] );
- if ( $actual === false ) {
- $hexForm = bin2hex( $m['charright'] );
- $this->output( "Invalid UTF-8 character \"{$m['charleft']}\" ($hexForm) at " .
- "line $lineNum: $line\n" );
- } else {
- $this->output( "Error: right number ({$m['hexright']}) does not match right " .
- "character ($actual) at line $lineNum: $line\n" );
- }
- $error = true;
- }
- if ( $error ) {
- $exitStatus = 1;
- continue;
- }
- if ( $mapToEmpty || $m['charright'] == 'NUL' ) {
- $m['charright'] = '';
- }
-
- # Find the set for the right character, add a new one if necessary
- if ( isset( $setsByChar[$m['charright']] ) ) {
- $setName = $setsByChar[$m['charright']];
- $setsByChar[$m['charleft']] = $setsByChar[$m['charright']];
- } else {
- $setName = $m['charright'];
- $setsByChar[$m['charleft']] = $m['charright'];
- }
-
- if ( !isset( $sets[$setName] ) ) {
- $sets[$setName] = [ $setName ];
- }
-
- $sets[$setName][] = $m['charleft'];
- }
-
- # Sets output
- foreach ( $sets as $members ) {
- fwrite( $setsFile, implode( ' ', $members ) . $endl );
- }
-
- # Map output
- $output = var_export( $setsByChar, true );
- $output = str_replace( "\n", $endl, $output );
- fwrite( $outputFile, '$equivset = ' . "$output;$endl" );
-
- # Serialized file
- fwrite( $serializedFile, serialize( $setsByChar ) );
-
- fclose( $setsFile );
- fclose( $outputFile );
- fclose( $serializedFile );
-
- $text = 'Finished';
- if ( $exitStatus > 0 ) {
- $text .= ' with errors';
- }
- $this->error( $text, $exitStatus );
- }
-}
-
-$maintClass = "GenerateEquivset";
-require_once DO_MAINTENANCE;
-