diff --git a/configure.php b/configure.php index 078c66e53..52c372bac 100755 --- a/configure.php +++ b/configure.php @@ -734,7 +734,6 @@ function getFileModificationHistory(): array { globbetyglob("{$ac['basedir']}/scripts", 'make_scripts_executable'); - { # file-entities.php $cmd = array(); @@ -756,7 +755,6 @@ function getFileModificationHistory(): array { } } - checking("for if we should generate a simplified file"); if ($ac["GENERATE"] != "no") { if (!file_exists($ac["GENERATE"])) { diff --git a/entities/global.ent-dist b/entities/global.ent-dist new file mode 100644 index 000000000..1d5c90fe8 --- /dev/null +++ b/entities/global.ent-dist @@ -0,0 +1,19 @@ + + + + + + + \ No newline at end of file diff --git a/entities/manual.ent-dist b/entities/manual.ent-dist new file mode 100644 index 000000000..62ca58550 --- /dev/null +++ b/entities/manual.ent-dist @@ -0,0 +1,14 @@ + + + + + + + + \ No newline at end of file diff --git a/entities/remove.ent-dist b/entities/remove.ent-dist new file mode 100644 index 000000000..6bf8988ad --- /dev/null +++ b/entities/remove.ent-dist @@ -0,0 +1,20 @@ + + + + + + + \ No newline at end of file diff --git a/manual.xml.in b/manual.xml.in index 39213558c..98fa230a6 100644 --- a/manual.xml.in +++ b/manual.xml.in @@ -11,6 +11,11 @@ %language-snippets; @TRANSLATION_ONLY_INCL_END@ + + +%manual-entities; + + @@ -57,7 +62,6 @@ &install.cloud.index; &install.fpm.index; &install.pecl; - &install.composer; &install.ini; diff --git a/scripts/dtdent-conv.php b/scripts/dtdent-conv.php new file mode 100644 index 000000000..777a2cb2d --- /dev/null +++ b/scripts/dtdent-conv.php @@ -0,0 +1,84 @@ + | ++----------------------------------------------------------------------+ +| Description: Convert DTD Entities files into XML Entities files. | ++----------------------------------------------------------------------+ + +See `entities.php` for detailed rationale. 
+ +Use this for converting bundled entities files that use into +XML version used by `entities.php`. + +After converting, add the generated entities in an global.ent or +manual.ent file, and delete the previous one. + +After all old style .ent files are split or converted, this script can +be removed. */ + +ini_set( 'display_errors' , 1 ); +ini_set( 'display_startup_errors' , 1 ); +error_reporting( E_ALL ); + +if ( count( $argv ) < 2 ) + die(" Syntax: php $argv[0] infile\n" ); + +$infile = $argv[1]; + +$content = file_get_contents( $infile ); + +$pos1 = 0; +while ( true ) +{ + $pos1 = strpos( $content , " DOMNodeList (ampunstand intended) + + $name = trim( $name ); + $text = str_replace( "&" , "&" , $text ); + + $frag = "\n"; + $frag .= " $text\n"; + $frag .= ''; + + $dom = new DOMDocument( '1.0' , 'utf8' ); + $dom->recover = true; + $dom->resolveExternals = false; + libxml_use_internal_errors( true ); + + $dom->loadXML( $frag , LIBXML_NSCLEAN ); + $dom->normalizeDocument(); + + libxml_clear_errors(); + + $text = $dom->saveXML( $dom->getElementsByTagName( "entity" )[0] ); + $text = str_replace( "&" , "&" , $text ); + + echo "\n$text\n"; +} diff --git a/scripts/dtdent-split.php b/scripts/dtdent-split.php new file mode 100644 index 000000000..d23863b31 --- /dev/null +++ b/scripts/dtdent-split.php @@ -0,0 +1,123 @@ + | ++----------------------------------------------------------------------+ +| Description: Split old DTD .ent file into individual XML files. | ++----------------------------------------------------------------------+ + +See `entities.php` for detailed rationale. + +Use this for spliting `language-snippets-ent` and possible other DTD +entities files into individual .xml files. + +After spliting, add generated files under doc-lang/entities/ , and +the original file, in one go. + +After all DTD .ent files are split or converted, this script can +be removed. 
*/ + +ini_set( 'display_errors' , 1 ); +ini_set( 'display_startup_errors' , 1 ); +error_reporting( E_ALL ); + +if ( count( $argv ) < 3 ) + die(" Syntax: php $argv[0] infile outdir [hash user]\n" ); + +$infile = $argv[1]; +$outdir = $argv[2]; +$hash = $argv[3] ?? ""; +$user = $argv[4] ?? "_"; + +$content = file_get_contents( $infile ); +$entities = []; + +// Parse + +$pos1 = 0; +while ( true ) +{ + $pos1 = strpos( $content , " $text ) +{ + $file = "$outdir/$name.xml"; + if ( file_exists( $file ) ) + echo( "Entity name colision, OVERWROTE: $file\n" ); +} + +// Write + +foreach( $entities as $name => $text ) +{ + $file = "$outdir/$name.xml"; + + if ( $hash == "" ) + $header = ''; + else + $header .= "\n"; + + file_put_contents( $file , $header . $text ); +} + +// Test + +$dom = new DOMDocument(); +$dom->recover = true; +$dom->resolveExternals = false; +libxml_use_internal_errors( true ); + +foreach( $entities as $name => $text ) +{ + $file = "$outdir/$name.xml"; + + $text = file_get_contents( $file ); + $text = "$text"; + + $dom->loadXML( $text ); + $err = libxml_get_errors(); + libxml_clear_errors(); + + foreach( $err as $e ) + { + $msg = trim( $e->message ); + if ( str_starts_with( $msg , "Entity '" ) && str_ends_with( $msg , "' not defined" ) ) + continue; + die( "Failed to load $file\n" ); + } +} + +$total = count( $entities ); +print "Generated $total files.\n"; diff --git a/scripts/entities.php b/scripts/entities.php new file mode 100644 index 000000000..2e927c8cd --- /dev/null +++ b/scripts/entities.php @@ -0,0 +1,412 @@ + | ++----------------------------------------------------------------------+ +| Description: Collect individual entities into an .entities.ent file. | ++----------------------------------------------------------------------+ + +# Mental model, or things that I would liked to know 20 years prior + +DTD Entity processing has more in common with DOMDocumentFragment than +DOMElement. 
In other words, plain text and multi-rooted XML files +are valid entity contents, even though they are not valid XML documents. + +Also, namespaces do not automatically "cross" between a parent +document and its entities, even if they are declared in the same +file, as local textual entities. Entities are, for all intents and +purposes, separate documents, with separate namespaces, and are +*expected* to have different default namespaces. + +So each of the, possibly multiple, "root" XML elements inside a +fragment needs to be annotated with a default namespace, even if the +"root" element occurs surrounded by text. For example: + +- "text<elem/>text" needs one namespace declaration, or it is invalid; and +- "<elem/><elem/>" needs TWO namespace declarations, or it is also invalid. + +# Output + +This script collects grouped and individual XML Entity files +(detailed below), at some expected relative paths, and generates a +doc-base/temp/entities.ent file with their respective DTD Entities. + +The output file has no duplicates, so collection order is important +to keep the necessary operational semantics. Here, later-loaded entities +take priority over (override) previously defined ones. Note that this is the +reverse of the DTD convention, where duplicated entity names are +ignored. The priority order used here is important to allow detecting +cases where global entities are being overwritten, or where expected +translatable entities are missing translations. + +# Individual XML Entities, or `.xml` files at `entities/` + +As explained above, the individual entity contents are not really +valid XML *documents*; they are at most valid XML *fragments*. +More technically, these XML files are really well-balanced texts, per +https://www.w3.org/TR/xml-fragment/#defn-well-balanced . 
+ +Yet, individual entities are stored in entities/ as .xml files, for +two reasons: first, text editors in general can highlights XML syntax in +well-balanced texts; and second, this allows normal revision tracking +per file, without requiring weird changes on `revcheck.php`. Note that +is *invalid* to place XML declaration in these fragment files, at least +in files that are invalid XML documents (on multi-node rooted ones). + +# Grouped entities files, file tracked + +For very small textual entities, down to simple text words or single +tag elements that may never change, individual entity tracking is +an overkill. This script also loads grouped XML Entities files, at +some expected locations, with specific semantics. + +These grouped files are really normal XML files, correctly annotated +with XML namespaces used on manuals, so any individual exported entity +has correct and clean XML namespace annotations. These grouped entity +files are tracked normally by revcheck, but are not directly included +in manual.xml.in, as they only participate in general entity loading, +described above. + +- global.ent - expected unreplaced +- manual.ent - expected replaced (translated) +- remove.ent - expected unused +- lang/entities/* - expected replaced (translated) + +*/ + +const PARTIAL_IMPL = true; // For while XML Entities are not fully implanted in all languages + +ini_set( 'display_errors' , 1 ); +ini_set( 'display_startup_errors' , 1 ); +error_reporting( E_ALL ); + +if ( count( $argv ) < 2 || in_array( '--help' , $argv ) || in_array( '-h' , $argv ) ) +{ + fwrite( STDERR , "\nUsage: {$argv[0]} [--debug] langCode [langCode]\n\n" ); + return; +} + +$filename = Entities::rotateOutputFile(); // idempotent + +$langs = []; +$normal = true; +$debug = false; + +for( $idx = 1 ; $idx < count( $argv ) ; $idx++ ) + if ( $argv[$idx] == "--debug" ) + $normal = false; + else + $langs[] = $argv[$idx]; +$debug = ! 
$normal; + +if ( $normal ) + print "Creating .entities.ent..."; +else + print "Creating .entities.ent in debug mode.\n"; +$debug = ! $normal; + +loadEnt( __DIR__ . "/../global.ent" , global: true , warnMissing: true ); +foreach( $langs as $lang ) +{ + loadEnt( __DIR__ . "/../../$lang/global.ent" , global: true ); + loadEnt( __DIR__ . "/../../$lang/manual.ent" , translate: true , warnMissing: true ); + loadEnt( __DIR__ . "/../../$lang/remove.ent" , remove: true ); + loadDir( $langs , $lang ); + Entities::$debugUnique = false; +} + +Entities::writeOutputFile(); +Entities::checkReplaces( $debug ); + +echo " done: " , Entities::$countTotalGenerated , " entities"; +if ( Entities::$countUnstranslated > 0 ) + echo ", " , Entities::$countUnstranslated , " untranslated"; +if ( Entities::$countReplacedGlobal > 0 ) + echo ", " , Entities::$countReplacedGlobal , " global replaced"; +if ( Entities::$countReplacedRemove > 0 ) + echo ", " , Entities::$countReplacedRemove , " remove replaced"; +if ( Entities::$countDuplicated > 0 ) + echo ", " , Entities::$countDuplicated , " duplicated (first language)"; +echo ".\n"; + +exit; + +class EntityData +{ + public function __construct( + public string $path , + public string $name , + public string $text ) {} +} + +class Entities +{ + private static string $filename = __DIR__ . 
"/../temp/entities.ent"; // idempotent + + private static array $entities = []; // All entities, bi duplications + private static array $global = []; // Entities expected not replaced + private static array $replace = []; // Entities expected replaced / translated + private static array $remove = []; // Entities expected not replaced and not used + private static array $unique = []; // For detecting duplicated global+en entities + private static array $count = []; // Name / Count + private static array $slow = []; // External entities, slow, uncontrolled file overwrites + + public static bool $debugUnique = true; // Start on unique mode, disable on second language + + public static int $countUnstranslated = 0; + public static int $countReplacedGlobal = 0; + public static int $countReplacedRemove = 0; + public static int $countTotalGenerated = 0; + public static int $countDuplicated = 0; + + static function put( string $path , string $name , string $text , bool $global = false , bool $replace = false , bool $remove = false ) + { + $entity = new EntityData( $path , $name , $text ); + Entities::$entities[ $name ] = $entity; + + if ( $global ) + Entities::$global[ $name ] = $name; + + if ( $replace ) + Entities::$replace[ $name ] = $name; + + if ( $remove ) + Entities::$remove[ $name ] = $name; + + if ( ! 
isset( Entities::$count[ $name ] ) ) + Entities::$count[$name] = 1; + else + Entities::$count[$name]++; + + if ( Entities::$debugUnique ) + { + if ( isset( Entities::$unique[ $name ] ) ) + { + Entities::$countDuplicated++; + if ( Entities::$countDuplicated == 1 ) + fwrite( STDERR , "\n\n" ); + fwrite( STDERR , " Duplicated entity: $name\n" ); + } + Entities::$unique[ $name ] = $entity; + } + } + + static function slow( string $path ) + { + if ( isset( $slow[$path] ) ) + fwrite( STDERR , "Unexpected file overwrite: $path\n" ); + $slow[ $path ] = $path; + } + + static function rotateOutputFile() + { + if ( file_exists( Entities::$filename ) ) + unlink( Entities::$filename ); + touch( Entities::$filename ); + Entities::$filename = realpath( Entities::$filename ); // only full paths on XML + } + + static function writeOutputFile() + { + saveEntitiesFile( Entities::$filename , Entities::$entities ); + } + + static function checkReplaces( bool $debug ) + { + Entities::$countTotalGenerated = count( Entities::$entities ); + Entities::$countUnstranslated = 0; + Entities::$countReplacedGlobal = 0; + Entities::$countReplacedRemove = 0; + + foreach( Entities::$entities as $name => $text ) + { + $replaced = Entities::$count[$name] - 1; + $expectedGlobal = in_array( $name , Entities::$global ); + $expectedReplaced = in_array( $name , Entities::$replace ); + $expectedRemoved = in_array( $name , Entities::$remove ); + + if ( $expectedGlobal && $replaced != 0 ) + { + Entities::$countReplacedGlobal++; + if ( $debug ) + print "Expected global, replaced $replaced times: $name\n"; + } + + if ( $expectedReplaced && $replaced != 1 ) + { + Entities::$countUnstranslated++; + if ( $debug ) + print "Expected translated, replaced $replaced times: $name\n"; + } + + if ( $expectedRemoved && $replaced != 0 ) + { + Entities::$countReplacedRemove++; + if ( $debug ) + print "Expected removed, replaced $replaced times: $name\n"; + } + } + } +} + +function loadEnt( string $path , bool $global = false 
, bool $translate = false , bool $remove = false , bool $warnMissing = false ) +{ + $realpath = realpath( $path ); + if ( $realpath === false ) + if ( PARTIAL_IMPL ) + return; + else + if ( $warnMissing ) + fwrite( STDERR , "\n Missing entity file: $path\n" ); + $path = $realpath; + + $text = file_get_contents( $path ); + $text = str_replace( "&" , "&" , $text ); + + $dom = new DOMDocument( '1.0' , 'utf8' ); + if ( ! $dom->loadXML( $text ) ) + die( "XML load failed for $path\n" ); + + $xpath = new DOMXPath( $dom ); + $list = $xpath->query( "/*/*" ); + + foreach( $list as $ent ) + { + // weird, namespace correting, DOMNodeList -> DOMDocumentFragment transform + $other = new DOMDocument( '1.0' , 'utf8' ); + + foreach( $ent->childNodes as $node ) + $other->appendChild( $other->importNode( $node , true ) ); + + $name = $ent->getAttribute( "name" ); + $text = $other->saveXML(); + + $text = rtrim( $text , "\n" ); + $text = str_replace( "&" , "&" , $text ); + $lines = explode( "\n" , $text ); + array_shift( $lines ); // remove XML declaration + $text = implode( "\n" , $lines ); + + Entities::put( $path , $name , $text , $global , $translate , $remove ); + } +} + +function loadDir( array $langs , string $lang ) +{ + global $debug; + + $dir = __DIR__ . "/../../$lang/entities"; + $dir = realpath( $dir ); + if ( $dir === false || ! is_dir( $dir ) ) + if ( PARTIAL_IMPL ) + { + if ( $debug ) + print "Not a directory: $dir\n"; + return; + } + else + exit( "Error: not a directory: $dir\n" ); + + $files = scandir( $dir ); + $expectedReplaced = array_search( $lang , $langs ) > 0; + + foreach( $files as $file ) + { + $path = realpath( "$dir/$file" ); + + if ( str_starts_with( $file , '.' 
) ) + continue; + if ( is_dir( $path ) ) + continue; + + $text = file_get_contents( $path ); + $text = rtrim( $text , "\n" ); + + loadXml( $path , $text , $expectedReplaced ); + } +} + +function loadXml( string $path , string $text , bool $expectedReplaced ) +{ + $info = pathinfo( $path ); + $name = $info["filename"]; + $frag = "$text"; + + if ( trim( $text ) == "" ) + { + if ( ! PARTIAL_IMPL ) + fwrite( STDERR , "\n Empty entity (should it be in remove.ent?): '$path' \n" ); + Entities::put( $path , $name , $text ); + return; + } + + $dom = new DOMDocument( '1.0' , 'utf8' ); + $dom->recover = true; + $dom->resolveExternals = false; + libxml_use_internal_errors( true ); + + $res = $dom->loadXML( $frag ); + + $err = libxml_get_errors(); + libxml_clear_errors(); + + foreach( $err as $item ) + { + $msg = trim( $item->message ); + if ( str_starts_with( $msg , "Entity '" ) && str_ends_with( $msg , "' not defined" ) ) + continue; + + fwrite( STDERR , "\n XML load failed on entity file." ); + fwrite( STDERR , "\n Path: $path" ); + fwrite( STDERR , "\n Error: $msg\n" ); + return; + } + + Entities::put( $path , $name , $text , replace: $expectedReplaced ); +} + +function saveEntitiesFile( string $filename , array $entities ) +{ + $tmpDir = __DIR__ . "/temp"; // idempotent + + $file = fopen( $filename , "w" ); + fputs( $file , "\n\n\n" ); + + foreach( $entities as $name => $entity ) + { + $text = $entity->text; + $quote = ""; + + // If the text contains mixed quoting, keeping it + // as an external file to avoid (re)quotation hell. + + if ( strpos( $text , "'" ) === false ) + $quote = "'"; + if ( strpos( $text , '"' ) === false ) + $quote = '"'; + + if ( $quote == "" ) + { + if ( $entity->path == "" ) + { + $entity->path = $tmpDir . "/{$entity->path}.tmp"; + file_put_contents( $entity->path , $text ); + } + fputs( $file , "path}'>\n\n" ); + Entities::slow( $entity->path ); + } + else + fputs( $file , "\n\n" ); + } + + fclose( $file ); +}