#!/opt/jdk10/bin/jjs --language=es6 /** * wiki-copy/redact - Transform the *wget* copy to match the public face of the original wiki * * This program is idempotent, safe to re-run on a transformed copy. * * Copyright 2018, Michael Allan. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Votorola Software"), to deal in the Votorola Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicence, and/or sell copies of the Votorola Software, and to permit persons to whom the Votorola Software is furnished to do so, subject to the following conditions: The preceding copyright notice and this permission notice shall be included in all copies or substantial portions of the Votorola Software. THE VOTOROLA SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE VOTOROLA SOFTWARE OR THE USE OR OTHER DEALINGS IN THE VOTOROLA SOFTWARE. */ 'use strict'; ( function() { const ATOMIC_MOVE = Java.type('java.nio.file.StandardCopyOption').ATOMIC_MOVE; const CONTINUE = Java.type('java.nio.file.FileVisitResult').CONTINUE; const Files = Java.type( 'java.nio.file.Files' ); const Paths = Java.type( 'java.nio.file.Paths' ); const SimpleFileVisitor = Java.type( 'java.nio.file.SimpleFileVisitor' ); const StringBuilder = Java.type( 'java.lang.StringBuilder' ); const URI = Java.type( 'java.net.URI' ); /** The directory that comprises the wiki copy. 
* * @see #mediawikiDirectory * @see #wDirectory */ const wikiCopy = Paths.get( '/mnt/lan/obsidian-unbak/var/www/public/wiki-copy' ); /** The 'w' directory at the base of the wiki copy, corresponding to $wgArticlePath. [WG] * * @see #wikiCopy */ const wDirectory = wikiCopy.resolve( 'w' ); // const WRITE = Java.type('java.nio.file.StandardOpenOption').WRITE; /// ================================================================================================== /// S i m p l e d e c l a r a t i o n s i n l e x i c a l o r d e r /// ================================================================================================== /** @param counts (Map of numbers keyed by string) * @param key (string) * @param value (number) The amount to add. */ function addTo( counts, key, value ) { const count = counts.get( key ); counts.set( key, count === undefined? value: count + value ); } /** @param b (StringBuilder) * @return The same StringBuilder *b*, cleared of content. */ function clearedStringBuilder( b ) { const length = b.length(); if( length > 0 ) b.delete( 0, length ); return b; } const DOT_HTML = '.html'; const DOT_HTML_LENGTH = DOT_HTML.length(); const filePathDirectory = wDirectory.resolve( 'Special:FilePath' ); /** @param from (Path) The referer's directory. * @param ref (java.net.URI) The relative location of the referent. * @return (Path) The referent in normal form. 
*/ function fromDirectoryToReferent( from, ref ) { if( ref.getAuthority() !== null ) throw 'abort: Unexpected reference form: ' + ref; let referent = from.resolve( ref.getPath() ); if( !referent.startsWith( wikiCopy )) { throw 'abort: Local referent outside of wiki copy: ' + referent; } referent = referent.normalize(); // cleaned up from the *resolve* above return referent; } const FTAG_ASCII_BASE_README = 'base README file, US-ASCII'; const FTAG_ASCII_DOT_CSS = '.css minified, US-ASCII'; const FTAG_ASCII_DOT_PHP = '.php, US-ASCII'; const FTAG_ASCII_EDIT_REFUSAL = 'refusal to edit an image file, plain text, US-ASCII'; const FTAG_ASCII_JS_MINIFIED = 'JavaScript minified, US-ASCII'; const FTAG_ASCII_JS = 'JavaScript, US-ASCII'; const FTAG_BINARY_DOT_GIF_TS = '.gif, timestamped'; const FTAG_BINARY_DOT_JPG = '.jpg'; const FTAG_BINARY_DOT_PNG = '.png'; const FTAG_BINARY_DOT_PNG_TS = '.png, timestamped'; const FTAG_UTF8_BASE_LOG = 'base wget.log file, UTF-8'; const FTAG_UTF8_DIR_W_HTML = 'under w/ (+ !w/Special:FilePath/), ∴ HTML, UTF-8'; const FTAG_UTF8_DOT_HTML = '.html, UTF-8'; // Above are the type tags returned by FileTyping.tag. Together they exhaust the files // of the wiki copy. All text files are encoded either in UTF-8, or its subset US-ASCII. /** @param html (string) HTML text. * @return (java.lang.String) The HTML text with each entity reference '&' decoded. */ // java.lang.String's *replace* method differs from JavaScript's function htmlDecoded( html ) { // return html.replace( '&', '&' ); //// Invokes JavaScript's replace, despite the claim, "Nashorn represents strings as //// java.lang.String objects." https://docs.oracle.com/javase/10/nashorn/nashorn-java-api.htm // return html['replace(java.lang.CharSequence,java.lang.CharSequence)']( '&', '&' ); //// Instead gives "TypeError: ... not a function". return html.replace( htmlDecodedRE, '&' ); } const htmlDecodedRE = new RegExp( '&', 'g' ); /* This seems to suffice for Pass4_mendReferences. 
And for Pass2_reformDirectoryIndeces, exhaustive testing reveals no other entity reference. */ /** @param html (string) HTML text. * @return (java.lang.String) The HTML text with each '&' encoded as an entity reference. */ // java.lang.String as per *htmlDecoded* function htmlEncoded( html ) { // return html.replace( '&', '&' ); //// as per *htmlDecoded*: return html.replace( htmlEncodedRE, '&' ); } const htmlEncodedRE = new RegExp( '&', 'g' ); /** The 'mediawiki' directory at the base of the wiki copy, corresponding to $wgScriptPath. [WG] * * @see #wikiCopy */ const mediawikiDirectory = wikiCopy.resolve( 'mediawiki' ); /** @param text (string) * @param toWidth (number) */ function printPaddingFor( text, toWidth ) { for( let p = toWidth - text.length(); p > 0; --p ) stdout.print( ' ' ); } /** Runs this program. */ function run() { if( eval('var _tmp = null'), typeof _tmp !== 'undefined' ) throw 'abort: Not in strict mode'; // http://www.ecma-international.org/ecma-262/6.0/#sec-strict-mode-code // credit Noseratio, https://stackoverflow.com/a/18916788/2402790 if( !Files.exists( wikiCopy )) throw( 'abort: Missing copy directory: ' + wikiCopy ); Pass1_quantifyFileTypes.run(); Pass2_reformDirectoryIndeces.run(); Pass3_removeFileExtensions.run(); Pass4_mendReferences.run(); stdout.println(); } const STANDARD_INDEX_FILE_NAME_SIMPLEX = 'index'; const STANDARD_INDEX_FILE_NAME = STANDARD_INDEX_FILE_NAME_SIMPLEX + DOT_HTML; const stdout = Java.type('java.lang.System').out; const tmpDirectory_remote = Paths.get( '/mnt/lan/obsidian-unbak/tmp' ); /** A work file for eventual inclusion in the wiki copy. When ready, it is moved into place. */ const tmpFile_import = tmpDirectory_remote.resolve( 'wiki-copy-redact.import' ); /** @param o (java.lang.Object) */ function toString( o ) { // Workaround for *jjs* bug (10.0.1); a sporadic error on calls to Path.toString. 
// IncompatibleClassChangeError: Found interface java.nio.file.Path, but class was expected // const s = o.toString(); /// here the error is thrown const s = String( o ); return s; } /** @param reference (string) */ function uriCorrected( reference ) { return reference.replace( 'User:Nobody`ZeleaCom', 'User:Nobody%60ZeleaCom' ); // [UPS] } const wDirectory_wgetIndexFile = wikiCopy.resolve( 'w.1.html' ); /** Returns null if the given file is not a *wget* index file, * otherwise returns the directory it indexes. * * @param file (Path) * @param fileName (string) */ function wgetIndexed( file, fileName ) { const m = wgetIndexedRE.exec( fileName ); if( m === null ) return null; const directoryName = m[1]; const directory = file.getParent().resolve( directoryName ); return Files.isDirectory(directory)? directory: null; } /** Executed on the name of a *wget* index file ("foo.png"), * this RegExp captures as a group (1) the name of the indexed directory. * * @see #wgetIndexFileRE */ const wgetIndexedRE = new RegExp( '^(.+?)(?:\.[0-9]+)?\.html$' ); /** @param directoryName (string) * @return (RegExp) * * @see #wgetIndexedRE */ function wgetIndexFileRE( directoryName ) { return new RegExp( '^' + RegularExpressions.quote(directoryName) + '(?:\.[0-9]+)?\.html$' ); /* For directory w/Tor/assorted_division/, e.g., *wget* stores the index in file w/Tor/assorted_division.1.html, though it might instead have stored it in w/Tor/assorted_division.html, or in its proper, final location w/Tor/assorted_division/index.html */ } /// ================================================================================================== /// C o m p o u n d d e c l a r a t i o n s i n l e x i c a l o r d e r /// ================================================================================================== /** Classifying wiki-copy files. 
*/ const FileTyping = ( function() { const that = {}; // the public interface of FileTyping /** Executed on a dot-extended file name ("foo.png"), this RegExp captures as groups * (1) the dot-extension (".png"); and * (2) any timestamp query that *wget* appended to the name. */ const extendedFileNameRE = new RegExp( '(\\.[a-z]+)(\\?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z)?$' ); // the timestamp appendage occurs for image files under mediawiki/skins/*/images/ const LOAD_SCRIPT_NAME = 'load.php'; const LOAD_SCRIPT_NAME_LENGTH = LOAD_SCRIPT_NAME.length(); // - P u b l i c -------------------------------------------------------------------------------- /** Returns a type tag for the given file, namely one of the FTAG_* constants. It is a string * that briefly describes the form of the file's content, including its character encoding. * * @param file (Path) */ that.tag = function( file ) { typify: { const parent = file.getParent(); const fileName = file.getFileName().toString(); if( parent.equals( wikiCopy )) // base files { switch( fileName ) { case 'README': return FTAG_ASCII_BASE_README; case 'w.1.html': return FTAG_UTF8_DOT_HTML; // *wget* index file for wDirectory case 'wget.log': return FTAG_UTF8_BASE_LOG; } break typify; } if( !parent.equals(filePathDirectory) && file.startsWith(wDirectory) ) { return FTAG_UTF8_DIR_W_HTML; // here the initial *wget* copy had only .html files } if( fileName.endsWith( DOT_HTML )) return FTAG_UTF8_DOT_HTML; if( fileName.endsWith( '*.css' )) return FTAG_ASCII_DOT_CSS; // else NOT-DOT-CSS, q.v. below if( parent.equals( mediawikiDirectory )) { if( fileName.startsWith( LOAD_SCRIPT_NAME )) { if( fileName.length() == LOAD_SCRIPT_NAME_LENGTH ) return FTAG_ASCII_DOT_PHP; return FTAG_ASCII_JS_MINIFIED; // depends on NOT-DOT-CSS, q.v. 
above } if( fileName.endsWith( 'g&action=edit&externaledit=true&mode=file' )) { // ↑ this 'g' being the end of a ".jpg" or ".png" return FTAG_ASCII_EDIT_REFUSAL; } if( fileName.startsWith( 'index.php?title=MediaWiki:Gadget-ReferenceTooltips.js&action=raw&ctype=text%2Fjavascript' )) { // continues and ends with "&5395", or some other number that varies from copy to copy return FTAG_ASCII_JS; } } const match = extendedFileNameRE.exec( fileName ); if( match !== null ) { const ext = match[1]; const ts = match[2]; if( ext == '.png' ) return ts? FTAG_BINARY_DOT_PNG_TS: FTAG_BINARY_DOT_PNG; if( ts ) { if( ext == '.gif' ) return FTAG_BINARY_DOT_GIF_TS; } else if( ext == '.jpg' ) return FTAG_BINARY_DOT_JPG; } } throw 'abort: Unable to determine type of file ' + file; }; // - - - return that; }() ); // ================================================================================================== /** This pass through the wiki copy quantifies the types of file that it contains, * then outputs the result as a table. */ const Pass1_quantifyFileTypes = ( function() { const that = {}; // the public interface of Pass1_quantifyFileTypes /** Map of file counts (number) each keyed by a type tag (string). 
*/ const fileCounts = new Map(); function printResult() { // const rowArray = Array.from( fileCounts ); /// but Nashorn 10 complains, "Array.from is not a function" const rowArray = new Array(); // each row r is an array of cells comprising type tag r[0] and file count r[1] for( const r of fileCounts ) rowArray.push( r ); rowArray.sort( function( c,d ) { return d[1] - c[1];} ); // numerically by count stdout.println(); stdout.println( ' Count Type of file' ); stdout.println( ' ----- ------------' ); for( const r of rowArray ) { const count = String( r[1] ); printPaddingFor( count, /*toWidth*/8 ); stdout.print( count ); stdout.print( ' ' ); stdout.println( r[0] ); // type tag } } // - P u b l i c -------------------------------------------------------------------------------- that.run = function() { stdout.println(); stdout.println( 'Pass1_quantifyFileTypes ...' ); Files.walkFileTree( wikiCopy, new (Java.extend( SimpleFileVisitor )) { /** @param file (Path) */ visitFile: function( file, /*BasicFileAttributes ignored*/_aa ) { addTo( fileCounts, FileTyping.tag(file), 1 ); return CONTINUE; }, }); printResult(); }; // - - - return that; }() ); // ================================================================================================== /** This pass through the wiki copy reforms the non-standard directory index files created by *wget* * by moving them to standard locations and names. Its purpose is to avoid breakage of existing * references into the wiki by restoring the original access locations of directory-like wiki pages * (namely those with sub-pages). It does this by detecting any index file that *wget* has stored * for a directory ("*wget* index file") and reforming it as a standard index file: * moving it from the parent of the directory into the directory itself, and there renaming it. 
*/ const Pass2_reformDirectoryIndeces = ( function() { const that = {}; // the public interface of Pass2_reformDirectoryIndeces const Pattern = Java.type( 'java.util.regex.Pattern' ); const CASE_INSENSITIVE = Pattern.CASE_INSENSITIVE; const URL_ATT_PATTERN = 'action|cite|data|formaction|href|longdesc|manifest|ping|poster|src'; // those with URL-form values, http://w3c.github.io/html/fullindex.html#attributes-table const ATTRIBUTE_DECLARATION_LENGTH_MIN = "a='v'"; // close enough for practical purposes, anyway /** A pattern matcher (Matcher.find) that matches an HTML attribute declaration, * wherein the value is both non-empty and delimited by quotes. * It captures as groups (1) the attribute name; (2) the quote symbol; and (3) the value. */ const attributeDeclarationMatcher = Pattern.compile( '\\b(' + URL_ATT_PATTERN + ')\\s*=\\s*(\'|")\\s*(\\S+?)\\s*\\2', CASE_INSENSITIVE ).matcher( '' ); /** @param directory (Path) */ function ensureIndexForm( directory ) { const directoryName = directory.getFileName().toString(); if( directoryName == STANDARD_INDEX_FILE_NAME_SIMPLEX ) { throw 'abort: Forbidden directory name: ' + directory; // [IFC] } // Detect presence of *wget* index file for directory // ------ const _wgetIndexFileRE = wgetIndexFileRE( directoryName ); // [ARE] let wgetIndexFile = null; const siblingStream = (Files['newDirectoryStream(Path)'])( directory.getParent() ); try { for( const file of siblingStream ) { if( file.equals( directory )) continue; const fileName = file.getFileName().toString(); if( fileName == STANDARD_INDEX_FILE_NAME ) continue; // [IFC] const match = _wgetIndexFileRE.exec( fileName ); if( match === null ) continue; if( wgetIndexFile !== null ) { throw( 'abort: Duplicate *wget* index:\n' + ' *wget* index file ' + file + '\n' + ' duplicates file ' + wgetIndexFile ); } wgetIndexFile = file; }} finally { siblingStream.close(); } if( wgetIndexFile === null ) return; const typeTag = FileTyping.tag( wgetIndexFile ); addTo( indexCountsW, 
typeTag, 1 ); // Reform detected file as a standard index file // ------ const standardIndexFile = directory.resolve( STANDARD_INDEX_FILE_NAME ); const standardIndexFileExists = Files.exists( standardIndexFile ); /* Occurs once only: *wget* stores the wiki's main page at w.1.html, w/index.html, mediawiki/index.php.html, and w/Wiki:Main_page.html. Regardless the web server redirects all main page requests to w/Wiki:Main_page. See /etc/apache2/5_domain/reluk.ca/public/5_redirect.conf. This simulates the behaviour of MediaWiki. The only reason to delete w.1.html here is its odd location in the base of the wiki copy, which is confusing. */ if( standardIndexFileExists ) { if( !wgetIndexFile.equals( wDirectory_wgetIndexFile )) { throw 'abort: Standard index file already exists, *wget* index is redundant: ' + wgetIndexFile; } stdout.println( ' Deleting base *wget* index file ' + wgetIndexFile ); } else { reformToImport( wgetIndexFile, directory ); Files.move( tmpFile_import, standardIndexFile, ATOMIC_MOVE ); addTo( indexCountsWR, typeTag, 1 ); } Files.delete( wgetIndexFile ); } /** Map of counts (number) of *wget* index files, each keyed by the file's type tag (string). */ const indexCountsW = new Map(); /** Map of counts (number) of *wget* index files that were reformed to standard index files, * each keyed by the file's type tag (string). */ const indexCountsWR = new Map(); const lineBuilder = new StringBuilder( /*initial capacity, guess*/900 ); // for *mendedLine* /** Mends all relative references in the line that will be broken by moving the containing file. * * @param line (string) The line to mend. * @param file (Path) The containing file. * @param directoryOld (Path) The containing file's old directory. * @param directoryNew (Path) The containing file's new directory. * * @return (java.lang.CharSequence) The mended version of the line, * or null if it needs no mending. 
*/ function mendedLine( line, file, directoryOld, directoryNew ) { const cEnd = line.length; if( cEnd < ATTRIBUTE_DECLARATION_LENGTH_MIN ) return null; const m = attributeDeclarationMatcher.reset( line ); if( !m.find() ) return null; const out = clearedStringBuilder( lineBuilder ); // for the mended version of the line let cOut = 0; // cursor of next character within line that is pending for *out* do { // Get a reference that needs mending // --------------- const attName = m.group(1); if( attName == 'ping' ) throw 'abort: Unsupported attribute "' + attName + '"'; // *ping* may contain multiple URLs. See also 'ping' in Pass4_mendReferences. // http://w3c.github.io/html/infrastructure.html#set-of-comma-separated-tokens const ref = uriCorrected( htmlDecoded( m.group( 3 ))); const refU = new URI( ref ); // parsed; either parses cleanly, or throws exception if( refU.isAbsolute() ) continue; // formally external, likely unbreakable // Mend the reference // ------------------ const referent = fromDirectoryToReferent( directoryOld, refU ); const referentName = toString( referent.getFileName() ); if( !Files.exists(referent) && wgetIndexed(referent,referentName) === null ) { // Not if the referent is a *wget* index file. Then its disappearance would be // expected, a run of this Pass having moved it, tripping over itself, so to speak. 
throw 'abort: Unmendable reference, no such referent: ' + ref; } const relativePathNew = directoryNew.relativize( referent ); // [URU] let ps = toString( relativePathNew ); // path string if( ps.length() === 0 ) ps = '.'; // referent is directoryNew itself else if( toString(referent.getName(0)).contains( ':' )) ps = './' + ps; // as per Pass4_mendReferences.newDirectoryReference if( Files.isDirectory( referent )) ps += '/'; // [EDR] const refMU = new URI( /*scheme*/null, /*authority*/null, /*path*/ps, refU.getQuery(), refU.getFragment() ); // mended version of *refU* const refM = htmlEncoded( refMU.toASCIIString() ); // " of *ref* const qS = m.group( 2 ); // quote symbol out.append( line, cOut, m.start() ); // what is pending up to left edge of this match out.append( attName ).append( '=' ).append( qS ).append( refM ).append( qS ); cOut = m.end(); // now pending from right edge of this match } while( m.find() ); if( out.length() === 0 ) return null; out.append( line, cOut, cEnd ); // the remainder of the line return out; } function printResult() { if( indexCountsW.size === 0 ) return; const rowArray = new Array(); /* Each row r is an array of cells comprising type tag r[0], *wget* index count r[1] and reformed *wget* index count r[2]. */ for( const r of indexCountsW ) { const tag = r[0]; // and r[1] already contains the count of *wget* indeces const count = indexCountsWR.get( tag ); // reformed *wget* indeces r[2] = count === undefined? 
0: count; rowArray.push( r ); } rowArray.sort( function( c,d ) { return d[1] - c[1];} ); // on *wget* index count stdout.println(); stdout.println( ' Indeces Reformed Type of file' ); stdout.println( ' ------- -------- ------------' ); for( const r of rowArray ) { let s; stdout.print( ' ' ); s = String( r[1] ); printPaddingFor( s, /*toWidth*/7 ); stdout.print( s ); stdout.print( ' ' ); s = String( r[2] ); printPaddingFor( s, /*toWidth*/7 ); stdout.print( s ); stdout.print( ' ' ); stdout.println( r[0] ); } } /** @param wgetIndexFile (Path) The *wget* index file to reform. * @param directory (Path) The indexed directory. */ function reformToImport( wgetIndexFile, directory ) // to tmpFile_import, that is { const directoryOld = wgetIndexFile.getParent(); let _in = Files.lines( wgetIndexFile ); // [CSE, CSC] let out = Files.newBufferedWriter( tmpFile_import ); // [CSE] try { for( const inII = _in.iterator(); inII.hasNext(); ) { const line = inII.next(); const lineM = mendedLine( line, wgetIndexFile, directoryOld, directory ); out.append( lineM === null? line:lineM ); out.append( '\n' ); }} finally { _in.close(); out.close(); // flushes, too } } // - P u b l i c -------------------------------------------------------------------------------- that.run = function() { stdout.println(); stdout.println( 'Pass2_reformDirectoryIndeces ...' ); Files.walkFileTree( wikiCopy, new (Java.extend( SimpleFileVisitor )) { // /** @param directory (Path) // */ // preVisitDirectory: function( directory, /*BasicFileAttributes ignored*/_aa ) // { // ensureIndexForm( directory ); // return CONTINUE; // }, // //// NoSuchFileException. It seems to be tripping over itself, one sibling (the directory) //// removing another (its *wget* index file). 
Better wait till all siblings are visited: /** @param directory (Path) * @param x (IOException) */ postVisitDirectory: function( directory, x ) { if( x !== null ) throw x; const childStream = (Files['newDirectoryStream(Path)'])( directory ); try{ for( const c of childStream ) { if( Files.isDirectory( c )) ensureIndexForm( c ); }} finally{ childStream.close(); } return CONTINUE; }, }); printResult(); }; // - - - return that; }() ); // ================================================================================================== /** This pass through the wiki copy reforms the copied article files by removing the '.html' * extension which *wget* adds. Its purpose is to avoid breakage of existing references into * the wiki, all of which lack this extension. * *
This pass depends on Pass2_reformDirectoryIndeces having removed all *wget* index files, * which this pass would otherwise rename like any other '.html' article file.
*/ const Pass3_removeFileExtensions = ( function() { const that = {}; // the public interface of Pass3_removeFileExtensions /** @param file (Path) */ function ensureForm( file ) { const fileName = file.getFileName().toString(); if( fileName == STANDARD_INDEX_FILE_NAME ) return; if( !fileName.endsWith( DOT_HTML )) return; const fileNameM = fileName.substring // with the '.html' extension removed ( 0, fileName.length() - DOT_HTML_LENGTH ); const fileM = file.getParent().resolve( fileNameM ); if( Files.exists( fileM )) throw( 'abort: Move target already exists: ' + fileM ); Files.move( file, fileM, ATOMIC_MOVE ); ++fileCount; } let fileCount = 0; // - P u b l i c -------------------------------------------------------------------------------- that.run = function() { stdout.println(); stdout.println( 'Pass3_removeFileExtensions ...' ); Files.walkFileTree( wDirectory, new (Java.extend( SimpleFileVisitor )) // Not including mediawikiDirectory, where the purpose of removing file extensions // (to avoid broken references) cannot be fulfilled without also mending // the file names that *wget* mangles by encoding queries into them. { /** @param file (Path) */ visitFile: function( file, /*BasicFileAttributes ignored*/_aa ) { ensureForm( file ); return CONTINUE; }, }); if( fileCount !== 0 ) stdout.println( ' Removed ' + fileCount ); }; // - - - return that; }() ); // ================================================================================================== /** This pass through the wiki copy mends references that were broken by *wget*, or by other passes. * *This pass depends on Pass2_reformDirectoryIndeces having already removed the *wget* index * file wiki-copy/w.1.html from the base, thereby leaving no file in the base that might need * mending. (This pass mends no base file.)
*/ const Pass4_mendReferences = ( function() { const that = {}; // the public interface of Pass4_mendReferences const Character = Java.type( 'java.lang.Character' ); /** @param file (Path) * @param typeTag (string) The file's type tag. */ function ensureFileMended( file, typeTag ) { const _isHTML = isHTML( typeTag ); // [ARE] refCountB = refCountM = 0; fileMT.clear(); let _in = Files.lines( file ); // [CSC, CSE] let out = clearedStringBuilder( fileBuilder ); try { for( const inII = _in.iterator(); inII.hasNext(); ) { const line = inII.next(); const lineM = mendedLine( line, file, typeTag, _isHTML ); out.append( lineM === null? line:lineM ); out.append( '\n' ); }} finally { _in.close(); } if( refCountB === 0 ) return; // no breakable references addTo( refCountsB, typeTag, refCountB ); addTo( fileCountsB, typeTag, 1 ); if( refCountM === 0 ) return; // no mended references _in = out; out = Files.newBufferedWriter( tmpFile_import ); // [CSE] // OPT, the use of *fileBuilder* makes this buffer redundant out.append( _in ); out.close(); // flushes, too Files.move( tmpFile_import, file, ATOMIC_MOVE ); addTo( refCountsM, typeTag, refCountM ); addTo( fileCountsM, typeTag, 1 ); for( const mendingType of fileMT.keys() ) addTo( fileCountsMT, mendingType, 1 ); } const fileBuilder = new StringBuilder( /*initial capacity*/150000 ); // for *ensureFileMended* // the biggest file I happened to see, at 126,625 bytes, is in wiki-copy/mediawiki/ /** Map of counts (number) of referring files that contain breakable references, * each keyed by the file's type tag (string). * * @see #refCountsB */ const fileCountsB = new Map(); /** Map of counts (number) of referring files that contain mended references, * each keyed by the file's type tag (string). * * @see #refCountsM */ const fileCountsM = new Map(); /** Map of counts (number) of referring files that contain mended references, * each keyed by a tag (string) that indicates the type of mending. 
* * @see #refCountsMT */ const fileCountsMT = new Map(); /** Set of mending type tags (string) for the file currently mending. * * @see #refCountsMT */ const fileMT = new Map(); /** @param mendingTypeTag (string) */ function incrementMT( mendingTypeTag ) { addTo( refCountsMT, mendingTypeTag, 1 ); fileMT.set( mendingTypeTag, 1 ); } /** @param typeTag (string) */ function isHTML( typeTag ) { return typeTag == FTAG_UTF8_DIR_W_HTML || typeTag == FTAG_UTF8_DOT_HTML; } /** Tells whether *ch* looks like part of an HTML *id* attribute. * * @param ch (string) */ function isIdChar( ch ) { if( isLetterOrDigitInASCII( ch )) return true; return ch == '_' || ch == '-' || ch == '.' || ch == ':'; // as per XML *name* syntax } /** Tells whether *ch* is an ASCII letter or digit. * * @param ch (string) */ function isLetterOrDigitInASCII( ch ) { return( ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z' || ch >= '0' && ch <= '9' ) } const lineBuilder = new StringBuilder( fileBuilder.capacity() ); // for *mendedLine* // The largest lines are in minified, one-liner files, which are among the largest files. // Therefore I give the line builder and the file builder the same initial capacity. /** @param line (string) The line to mend. * @param file (Path) The containing file. * @param typeTag (string) The type tag of the containing file. * @param isHTML (boolean) * * @return (java.lang.CharSequence) The mended version of the line, * or null if it needs no mending. 
*/ function mendedLine( line, file, typeTag, isHTML ) { const cEnd = line.length; if( cEnd < DOT_HTML_LENGTH ) return null; const out = clearedStringBuilder( lineBuilder ); // for the mended version of the line let cOut = 0; // cursor of next character within line that is pending for *out* refSeek: for( let c = 0, cLastEnd = 0, cNext;; c = cNext ) // reference search cursor c { // ============================ // Resolving the next reference within the line // ============================ // Seek next '.html', indicating a potential reference // ----------------- c = line.indexOf( DOT_HTML, c ); // c now stays on the dot // '.html' references alone need mending ∵ .html files alone are moved or renamed if( c < 0 ) break; cNext = c + DOT_HTML_LENGTH; // What delimiter encloses the reference? // -------------- let cR = cNext; // right scan cursor cR, will settle at right delimiter let delimiter; // the delimiting character to expect on the reference's left side delimiter: { if( cNext >= cEnd ) { delimiter = /*line end*/null; break delimiter; } // Scan rightward to end of reference // -------------- let ch = line.charAt( cR ); // character to right of '.html' if( ch == '.' ) continue refSeek; /* Three cases are possible: 1) not a reference; 2) this dot is part of another extension where the reference continues; or 3) this dot is a punctuation mark and the reference was made in human language, where it cannot have been broken by this, a merely restorative redact. 
*/ ++cNext; // not being a '.', the previous character cannot start the next '.html' if( isLetterOrDigitInASCII( ch )) continue refSeek; // the '.html' continues lexically, rather than terminating a reference fragment: if( ch == '#' ) for( ;; ) // likely a fragment, where reference continues { ++cR; // scanning further if( cR >= cEnd ) { delimiter = /*line end*/null; break delimiter; } ch = line.charAt( cR ); if( ch == '%' ) continue fragment; // encoding delimiter; two hex digits follow if( !isIdChar( ch )) // end of fragment { // Advance the next seek past the fragment, for two reasons: // 1) no breakable reference is likely within a fragment context; and // 2) otherwise the use of cNext in delimiter tests below will fail. cNext = cR; // first advance past the fragment to ch ++cNext; // then, again, ch "not being a '.'" advance one further break fragment; } } // Discern the delimiter // --------------------- if( ch == '"' ) // likely an HTML attribute declaration, the most frequent case { delimiter = ch; break delimiter; } if( ch == '<' ) continue refSeek; /* Likely the echo of an *href* in the body of an HTML *a* element. Not likely a breakable reference itself. */ if( ch == '&' && isHTML ) continue refSeek; /* '&' is reserved for entity and character references. How it could delimit a reference is unclear. */ if( ch == '?' ) continue refSeek; /* Two cases are possible: 1) not a reference; or 2) here is a reference with a '?' query delimeter, proving it to point outside of the wiki copy, wherein *wget* has removed '?' from all internal references. */ if( ch == ' ' || ch == ')' // common in human language || ch == '|' || ch == '}' ) // common in MediaWiki templates { if( isHTML ) continue refSeek; /* Human language or other readable content, most likely. So again, any reference here is probably unbreakable. 
*/ } if( ch == ',' && isHTML ) continue refSeek; // *ping* is the only URL-form attribute that allows ',' - not expected } if( delimiter === undefined ) { throw 'abort: Unable to discern delimiter of .html reference: ' + line; } if( delimiter === /*line end*/null ) { if( isHTML ) continue refSeek; /* Unlikely to be anything but human language. So again, any reference here is probably unbreakable. */ throw 'abort: Unexpected case: .html reference delimited by line end: ' + line; } // Scan leftward to start of reference // ------------- let cL = c; // left scan cursor cL, will settle at left delimiter do if( cL === cLastEnd ) throw 'Missing left delimiter on .html reference: ' + line; while( line.charAt(--cL) != delimiter ); let ref = line.substring( cL + 1, cR ); if( ref == 'Http://reluk.ca/project/votorola/ /javadoc/votorola/a/line/VOMir.html' ) { continue refSeek; // malformed [UPS]; clearly not a breakable reference, anyway } if( line.regionMatches( cL - TITLE_PREFIX_LENGTH, TITLE_PREFIX, 0, TITLE_PREFIX_LENGTH )) continue refSeek; // a title, not a reference ref = uriCorrected( ref ); if( isHTML ) ref = htmlDecoded( ref ); let refU = new URI( ref ); // parsed; either parses cleanly, or throws exception cLastEnd = cR + 1; // end of last cleanly parsed reference, just past right delimiter // =================================== // Dealing with the resolved reference // =================================== // Is the reference breakable? 
// --------------------------- if( refU.isAbsolute() ) continue refSeek; // formally external, likely unbreakable const directory = file.getParent(); const referent = fromDirectoryToReferent( directory, refU ); ++refCountB; // found a breakable reference, count it // Mend the reference if it is broken // ------------------ let refM = null; // raw mended reference, initially unencoded for HTML mend: { const referentName = toString( referent.getFileName() ); // Mending foo/index.html → foo/ // ----------------------------- if( referentName == STANDARD_INDEX_FILE_NAME ) // then referent is a directory index { refM = newDirectoryReference( /*from*/directory, /*to*/referent.getParent(), refU.getQuery(), refU.getFragment() ); incrementMT( 'foo/index.html → foo/' ); break mend; } if( Files.exists( referent )) continue refSeek; // not yet broken, first another pass must reform the referent // Mending foo.html → foo/ // ----------------------- const indexedDirectory = wgetIndexed( referent, referentName ); if( indexedDirectory !== null ) // then referent is a *wget* index file { const standardIndexFile = indexedDirectory.resolve( STANDARD_INDEX_FILE_NAME ); if( !Files.isRegularFile( standardIndexFile )) { throw 'abort: Unmendable *wget* index reference, no standard index file: ' + ref; } refM = newDirectoryReference( /*from*/directory, /*to*/indexedDirectory, refU.getQuery(), refU.getFragment() ); incrementMT( 'foo.html → foo/' ); break mend; } // Mending foo.html → foo // ---------------------- const referentNameM = referentName.substring // with the '.html' extension removed ( 0, referentName.length() - DOT_HTML_LENGTH ); const referentM = referent.getParent().resolve( referentNameM ); if( Files.isRegularFile( referentM )) { refM = newFileReference( /*from*/directory, /*to*/referentM, refU.getQuery(), refU.getFragment() ); incrementMT( 'foo.html → foo' ); break mend; } throw 'abort: Unmendable reference, no possible referent exists: ' + ref; } // ----- if( refM === null ) 
continue; // reference was not mended, it is not broken if( isHTML ) refM = htmlEncoded( refM ); out.append( line, cOut, cL ); // viz. what is pending up to left edge of this reference out.append( delimiter ).append( refM ).append( delimiter ); cOut = cLastEnd; // now pending from right edge of this reference ++refCountM; } if( out.length() === 0 ) return null; out.append( line, cOut, cEnd ); // the remainder of the line return out; } /** @param from (Path) The referer's directory. * @param to (Path) The referent directory. * @param query (string) The query to append, or null to append none. * @param fragment (string) The fragment to append, or null to append none. * @return (string) */ function newDirectoryReference( from, to, query, fragment ) { to = from.relativize( to ); // [URU] let ps = toString( to ); // path string if( ps.length() === 0 ) ps = '.'; // *to* and *from* are same directory else if( toString(to.getName(0)).contains( ':' )) ps = './' + ps; // 'If the path is relative, and if its first segment contains a colon character (':'), // then a "." segment is prepended. This prevents a relative URI with a path // such as "a:b/c/d" from later being re-parsed as an opaque URI with a scheme of "a" // and a scheme-specific part of "b/c/d".' [UN] Even the URI construction below // would misparse it so, despite the explicit null scheme. ps += '/'; // [EDR] const refMU = new URI( /*scheme*/null, /*authority*/null, /*path*/ps, query, fragment ); return refMU.toASCIIString(); } /** @param from (Path) The referer's directory. * @param to (Path) The referent file. * @param query (string) The query to append, or null to append none. * @param fragment (string) The fragment to append, or null to append none. 
* @return (string) */ function newFileReference( from, to, query, fragment ) { to = from.relativize( to ); // [URU] let ps = toString( to ); // path string if( ps.length() === 0 ) throw 'abort: File *to* relativizes as directory *from*'; if( toString(to.getName(0)).contains( ':' )) ps = './' + ps; // as per newDirectoryReference const refMU = new URI( /*scheme*/null, /*authority*/null, /*path*/ps, query, fragment ); return refMU.toASCIIString(); } function printResult() { let rowArray; // References by file type // ----------------------- if( refCountsB.size !== 0 ) { rowArray = new Array(); /* Each row r is an array of cells comprising type tag r[0], breakable reference count r[1], referring file count r[2], mended reference count r[3] and referring file count r[4]. */ for( const r of refCountsB ) { const tag = r[0]; // and r[1] already contains the count of breakable references r[2] = fileCountsB.get( tag ); // referring files for same let count; count = refCountsM.get( tag ); // mended references r[3] = count === undefined? 0: count; count = fileCountsM.get( tag ); // referring files for same r[4] = count === undefined? 
0: count; rowArray.push( r ); } rowArray.sort( function( c,d ) { return d[1] - c[1];} ); // on breakable reference count stdout.println(); stdout.println( ' Breakable Mended Type of referring file (file counts bracketed)' ); stdout.println( ' --------------- ------------- ----------------------' ); for( const r of rowArray ) { let s; stdout.print( ' ' ); s = String( r[1] ); printPaddingFor( s, /*toWidth*/8 ); stdout.print( s ); stdout.print( ' ' ); s = String( r[2] ); printPaddingFor( s, /*toWidth*/4 ); stdout.print( '(' ); stdout.print( s ); stdout.print( ') ' ); s = String( r[3] ); printPaddingFor( s, /*toWidth*/6 ); stdout.print( s ); stdout.print( ' ' ); s = String( r[4] ); printPaddingFor( s, /*toWidth*/4 ); stdout.print( '(' ); stdout.print( s ); stdout.print( ') ' ); stdout.println( r[0] ); } } // References by mending type // -------------------------- if( refCountsMT.size === 0 ) return; rowArray = new Array(); /* Each row r is an array of cells comprising type tag r[0], mended reference count r[1] and referring file count r[2]. */ for( const r of refCountsMT ) { const tag = r[0]; // and r[1] already contains the count of mended references r[2] = fileCountsMT.get( tag ); // referring files for same rowArray.push( r ); } rowArray.sort( function( c,d ) { return d[1] - c[1];} ); // on mended reference count stdout.println(); stdout.println( ' Mended Type of mending' ); stdout.println( ' -------------- ---------------' ); for( const r of rowArray ) { let s; stdout.print( ' ' ); s = String( r[1] ); printPaddingFor( s, /*toWidth*/6 ); stdout.print( s ); stdout.print( ' ' ); s = String( r[2] ); printPaddingFor( s, /*toWidth*/5 ); stdout.print( '(' ); stdout.print( s ); stdout.print( ') ' ); stdout.println( r[0] ); } } let refCountB, refCountM; // breakable, mended, per file /** Map of counts (number) of breakable references, each keyed by the referring file's type tag * (string). 
A breakable reference is one that is expected to break in the other passes, * whether or not it actually does. * * @see #fileCountsB */ const refCountsB = new Map(); /** Map of counts (number) of broken references that were mended, each keyed by the referring * file's type tag (string). No broken reference is left unmended. * * @see #fileCountsM */ const refCountsM = new Map(); /** Map of counts (number) of broken references that were mended, each keyed by a tag (string) * that indicates the type of mending. * * @see #fileCountsMT */ const refCountsMT = new Map(); const TITLE_PREFIX = '"wgTitle":'; // leading part of title declaration in JSON const TITLE_PREFIX_LENGTH = TITLE_PREFIX.length(); // - P u b l i c -------------------------------------------------------------------------------- that.run = function() { stdout.println(); stdout.println( 'Pass4_mendReferences ...' ); Files.walkFileTree( wikiCopy, new (Java.extend( SimpleFileVisitor )) { /** @param file (Path) */ visitFile: function( file, /*BasicFileAttributes ignored*/_aa ) { if( !wikiCopy.equals(file.getParent()) ) /* Skipping wiki-copy base files. Not only has the base no file that needs mending (see "depends on" above), but one that must not be mended, namely the file wget.log. */ { const typeTag = FileTyping.tag( file ); switch( typeTag ) { default: throw 'abort: Unhandled type tag "' + typeTag + '"'; // In practice, the scope here might be narrowed further to HTML files alone; // no other type happens to contain a reference that needs mending. 
// For sake of robustness, however, try to cover all potential referers: // Amenable text files // ------------------- case FTAG_ASCII_BASE_README: case FTAG_ASCII_DOT_CSS: case FTAG_ASCII_DOT_PHP: case FTAG_ASCII_EDIT_REFUSAL: case FTAG_ASCII_JS: case FTAG_UTF8_BASE_LOG: case FTAG_UTF8_DIR_W_HTML: case FTAG_UTF8_DOT_HTML: ensureFileMended( file, typeTag ); // Problematic text files, skipping these // ---------------------- case FTAG_ASCII_JS_MINIFIED: /* These tend to be large, one-liner files. Some raise exceptions that are hard to deal with, while, in light of the narrowly defined 'Mending' procedures (q.v. further above) none is likely to be broken. The only breaks to mend arise from references to directory indeces and '.html' file extensions that were introduced by *wget* and did not exist in the original wiki, whence these minified scripts come, very likely *untouched by wget*. */ // Binary files, skipping these // ------------ case FTAG_BINARY_DOT_GIF_TS: case FTAG_BINARY_DOT_JPG: case FTAG_BINARY_DOT_PNG: case FTAG_BINARY_DOT_PNG_TS: } } return CONTINUE; }, }); printResult(); }; // - - - return that; }() ); // ================================================================================================== /** Dealing with regular expressions. */ const RegularExpressions = ( function() { const that = {}; // the public interface of RegularExpressions /** A RegExp that matches any character that is safe to quote by prepending a backslash. */ const allSlashQuotablesRE = new RegExp( '[^0-9A-Za-z]', 'g' ); // This assumes that ECMAScript does not deviate from Perl 5's guarantee: // "if the character following the backslash is an ASCII punctuation (non-word) character // (that is, anything that is not a letter, digit, or underscore), then the backslash // just takes away any special meaning of the character following it. // ... So it is safe to put a backslash in front of a non-word character." // http://perldoc.perl.org/perlrebackslash.html // // Cf. 
http://perldoc.perl.org/functions/quotemeta.html // - P u b l i c -------------------------------------------------------------------------------- /** @param literal (string) A string literal to quote for use within a RegExp pattern. * @return The quoted string. */ that.quote = function( literal ) { // Java's Pattern.quote is incompatible with JavaScript's RegExp; basically it wraps // the literal with a quoting construct that is unsupported by RegExp: '\Q' and '\E'. // Here then is a construct almost equally simple, bullet-proof and future-proof: // return allSlashQuotablesRE[Symbol.replace]( literal, '\\$&' ); // all characters quoted //// Nashorn complains, "allCharacters[Symbol.replace] is not a function", so: return String(literal).replace( allSlashQuotablesRE, '\\$&' ); // all characters quoted // Cf. https://github.com/benjamingr/RegExp.escape/blob/master/polyfill.js }; // - - - return that; }() ); //////////////////// run(); }() ); /** NOTES * ----- * [ARE] Avoiding the 'ReferenceError: "foo" is not defined' that occurs with "const foo = foo()". * * [CSC] Costly string constructions. Costly strings too, if any of the files is a minified, * one-liner. Cf. the more efficient CharBuffer method of /usr/local/lib/TextFilter.js. * * [CSE] Compatible stream encoding. The character encoding of this stream is UTF-8, which, * as the FTAG_* constants assure, is compatible with all text files in the wiki copy. * * [EDR] Explicit directory reference: a trailing '/' to avoid a redirect there by the web server. * * [IFC] Index file conflict. No directory name shall equal STANDARD_INDEX_FILE_NAME_SIMPLEX. * Therefore no index file placed by *redact* can conflict with one placed by *wget*. * * [UN] · https://docs.oracle.com/javase/10/docs/api/java/net/URI.html#normalize() * * [UPS] URL path syntax. https://tools.ietf.org/html/rfc3986#section-3.3 * * [URU] URI.relativize cannot produce '..' paths. Relying instead on Path.relativize. 
* https://stackoverflow.com/a/705963/2402790 * https://bugs.java.com/bugdatabase/view_bug.do?bug_id=6226081 * * [WG] · The various '$wg' variables are from LocalSettings.php, the wiki's configuration file. */