package votorola.a.diff; // Copyright 2010-2011, Michael Allan. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Votorola Software"), to deal in the Votorola Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicence, and/or sell copies of the Votorola Software, and to permit persons to whom the Votorola Software is furnished to do so, subject to the following conditions: The preceding copyright notice and this permission notice shall be included in all copies or substantial portions of the Votorola Software. THE VOTOROLA SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE VOTOROLA SOFTWARE OR THE USE OR OTHER DEALINGS IN THE VOTOROLA SOFTWARE.

import java.io.*;
import java.net.*;
import java.nio.charset.*;
import java.util.regex.*;
import votorola.g.hold.*;
import votorola.g.io.*;
import votorola.g.lang.*;
import votorola.g.logging.*;
import votorola.g.net.*;


/** A bidirectional transformer of wikitext lines, that breaks incoming wikitext into
  * multiple lines at punctuation marks.  Lines thus broken are marked by an appended
  * soft-break character.  We need this because diff/patch tools are line oriented,
  * depending on newline characters for context; whereas wikitext is typically
  * paragraph-oriented and has very long lines.
  *
  * <h3>Bracket markers to distinguish meta-content from proper content</h3>
  *
  * <p>The line transformer recognizes markers that distinguish meta-content from proper
  * content.  The meta-content must bracket the page like a header and footer.  It is
  * marked by lines that contain {@link #BRAC_LINE_MARKER_PATTERN}.  These lines and the
  * content above or below are ignored for differencing purposes.  Pages are thus
  * structured as follows:</p>
  *
  * <pre>
  *     The optional high bracket, or header, ends with a line
  *     containing this marker &lt;!--voHiBrac--&gt;
  *
  *     The content proper for differencing purposes
  *
  *     This &lt;!--voLoBrac--&gt; marks the first line
  *     of the optional low bracket, or footer.
  * </pre>
  *
  * <p>Note that this transformer may append a final newline to outgoing wikitext that was
  * not in the original.  This is unlikely to have much effect, because MediaWiki's own
  * editor has the same behaviour.</p>
  *
  *     @see "Category:Draft Bracket markers"
  */
public final @ThreadSafe class LineTransformer1
{

    // In addition to high and low brackets, we might have allowed floating spans of
    // ignored text in middle.  But it would be tricky.  The incoming appendFromWiki can
    // reduce the spans to empty stubs with locally unique identifiers, such that the
    // outgoing appendToWiki can replace them with the correct original content.  The
    // stubs would have to be paired between the two files, however, such that equivalent
    // stubs are given equal identifiers that don't show up as differences in the result.
    // But how to establish that equivalence?  Maybe leave it, till there's a use case.



    /** Appends a line of incoming text from the wiki, breaking it into multiple output
      * lines at selected spaces and punctuation marks.  Each break that is inserted here
      * is written as SOFTBREAK_CHAR followed by a newline.  Any run of SOFTBREAK_CHAR
      * that already trails the original line is written twice over (byte-stuffed), so
      * that {@linkplain #appendToWiki(String,Writer) appendToWiki} can tell it apart
      * from an inserted break indicator.  The output always ends with a newline.
      *
      *     @param line a single line of wikitext, without its line terminator.
      *     @param out the writer to which the transformed text is appended.
      *
      *     @throws IOException if writing to <code>out</code> fails.
      */
    void appendFromWiki( final String line, final Writer out ) throws IOException
    {
        final int cLast = line.length() - 1;
        if( cLast >= 0 )
        {
            int trailingSoftbreakCount = 0; // in original
            int letterCount = 0;
            char ch = 'a'; // anything unimportant
            for( int c = 0;; ++c )
            {
                boolean isLast = c == cLast;
                final char chPrior = ch;
                ch = line.charAt( c );
                boolean toSoftbreak = false; // thus far
                if( ch == SOFTBREAK_CHAR ) ++trailingSoftbreakCount;
                else
                {
                    trailingSoftbreakCount = 0; // reset counter
                    if( Character.isLetter( ch )) ++letterCount;
                    softbreak:
                    {
                        if( isLast ) break softbreak; // not at very end, where it breaks anyway
                        if( letterCount < 3 ) break softbreak;
                          // Not on "1. Title" or ":: indented text".  Rather wait for
                          // a few letters to appear.  But not too large a count, or a
                          // small insertion might (in some cases) effect a cascade
                          // across multiple lines.

                        if( ch == ' ' ) // on space, depending on adjacent chars
                        {
                            assert !isLast;
                            char chNext = line.charAt( c + 1 );
                            if( chNext == '<' || chNext == '[' )
                            {
                                toSoftbreak = true; // start tag or start of link
                            }
                            else if( chPrior == '>' || chPrior == ']' // end tag or end of link
                              || chPrior == '-' || chPrior == '\u2014' // em dash
                              || chPrior != '/' // don't break "[http://etc/path/ link]"
                                && Character.getType(chPrior) == Character.OTHER_PUNCTUATION )
                                  // .,;:?! and the like, per
                                  // http://www.fileformat.info/info/unicode/category/Po/list.htm
                                  // The slash is itself in that category, hence its
                                  // explicit exclusion; && binds tighter than ||.
                            {
                                while( chNext == ' ' ) // gobble contiguous spaces
                                {
                                    out.write( ch );
                                    ++c;
                                    isLast = c == cLast;
                                    if( isLast ) break;

                                    chNext = line.charAt( c + 1 );
                                }
                                toSoftbreak = !isLast; // no break if the spaces ran to line end
                            }
                        }
                        else if( Character.getType(ch) == Character.OTHER_PUNCTUATION )
                        {
                            assert !isLast;
                            final int cNext = c + 1;
                            final char chNext = line.charAt( cNext );
                            if( chNext == '[' ) toSoftbreak = true; // "end of phrase,[ref link]"
                            else if( chNext == '<' && cNext < cLast )
                            {
                                // Break before an opening tag that directly follows the
                                // punctuation, but not before a closing tag ("</").
                                final char chNextNext = line.charAt( cNext + 1 );
                                toSoftbreak = chNextNext != '/';
                            }
                        }
                    }
                }
                out.write( ch );
                if( isLast )
                {
                    for( int s = 0; s < trailingSoftbreakCount; ++s )
                    {
                        out.write( SOFTBREAK_CHAR ); // byte-stuff
                    }
                }
                if( toSoftbreak )
                {
                    out.write( SOFTBREAK_CHAR ); // indicator
                    out.write( '\n' ); // actual break
                }
                if( isLast ) break;
            }
        }
        out.write( '\n' );
    }



    /** Appends a line of outgoing text, destined for the wiki.  This undoes the
      * incoming transformation.  An odd number of trailing SOFTBREAK_CHAR means that
      * the line ends with an inserted break indicator; the indicator is dropped and no
      * newline is written, so the following line is rejoined to this one.  Otherwise a
      * newline is written.  Any byte-stuffed trailing run is halved back to its
      * original length.
      *
      *     @param line a single line of transformed text, without its line terminator.
      *     @param out the writer to which the restored wikitext is appended.
      *
      *     @throws IOException if writing to <code>out</code> fails.
      */
    public void appendToWiki( final String line, final Writer out ) throws IOException
    {
        // cf. votorola.s.gwt.mediawiki.SegmentPatternBuilder

        int lineLength = line.length();
        int trailingSoftbreakCount = 0;
        for( int c = lineLength - 1; c >= 0; --c )
        {
            if( line.charAt(c) != SOFTBREAK_CHAR ) break;

            ++trailingSoftbreakCount;
        }
        final boolean toBreak;
        if( IntegerX.isOdd( trailingSoftbreakCount )) // then we inserted a softbreak
        {
            --lineLength; // eat the indicator
            toBreak = false; // eat the break
        }
        else toBreak = true;
        if( trailingSoftbreakCount > 1 ) lineLength -= trailingSoftbreakCount / 2; // un-byte-stuff
        out.write( line, 0, lineLength );
        if( toBreak ) out.write( '\n' );
    }



    /** The pattern of a "low bracket" or "high bracket" marker in a line of wikitext.
      * Group (1) contains the bracket type.  If it equals "Hi", then it marks the last
      * line of the "high bracket".  The high bracket (basically a header) includes all
      * lines from the start of the page to this line.  If it equals "Lo", then it marks
      * the first line of the "low bracket".  The low bracket (a footer) includes all
      * lines from this one to the end of the page.  All lines of the high and low
      * brackets are ignored for differencing purposes.
      */
    static final Pattern BRAC_LINE_MARKER_PATTERN = Pattern.compile( "<!--vo(Hi|Lo)Brac-->" );
      // The pattern must define group (1), else Matcher.group(1) in fetchPageAsFile
      // throws IndexOutOfBoundsException.  It matches the two markers shown in the
      // class description.



    /** Downloads the wikitext source of the specified page, and splits it into one or
      * more temporary files.
      *
      *     @param s the base URI for script execution in the wiki, without a trailing
      *       slash (/).
      *     @param idType one of "curid" or "oldid".
      *     @param id the page identifier (curid) or revision identifier (oldid).
      *     @param prefix the base {@linkplain File#createTempFile(String,String) prefix}
      *       for the temporary files.
      *     @param fileSpool the spool to hold temporary files.  If unwound, the files
      *       will be deleted.
      *
      *     @return an array of three elements: [0] a file containing the transformed
      *       content of the high bracket up to and including the line that contains
      *       BRAC_LINE_MARKER_PATTERN (Hi), or null if there is none; [1] a file
      *       containing the transformed main body of the source text; and [2] a file
      *       containing the untransformed low bracket, starting with the line that
      *       contains BRAC_LINE_MARKER_PATTERN (Lo), or null if there is none.  Note
      *       that only the low bracket file is untransformed.
      *
      *     @throws IOException if the download fails, or a temporary file cannot be
      *       created or written.
      *
      *     @see <a href="https://www.mediawiki.org/wiki/Manual:Parameters_to_index.php"
      *       >www.mediawiki.org/wiki/Manual:Parameters_to_index.php</a>
      */
    public File[] fetchPageAsFile( final URI s, final String idType, final int id,
      final String prefix, final Spool fileSpool ) throws IOException
    {
        final HttpURLConnection http;
        try
        {
            final URI uri = new URI( s + "/index.php?action=raw&" + idType + "=" + id );
            LoggerX.i(LineTransformer1.class).fine( "querying wiki " + uri );
            http = (HttpURLConnection)( uri.toURL().openConnection() );
        }
        catch( URISyntaxException x ) { throw new RuntimeException( x ); }

        final File file = File.createTempFile( prefix, "." + id );
        fileSpool.add( new FileHold( file ));
        File fileHi = null;
        File fileLo = null;
        URLConnectionX.connect( http );
        try
        {
            final BufferedReader in = new BufferedReader( new InputStreamReader(
              http.getInputStream(), "UTF-8" )); // assuming UTF-8, maybe FIX by reading the HTTP header
            try
            {
                final Charset nativeCharset = Charset.defaultCharset();
                BufferedWriter out = new BufferedWriter( new OutputStreamWriter(
                  new FileOutputStream(file), nativeCharset ));
                fetch: try
                {
                    String l;

                  // High bracket and main body
                  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                    for( ;; )
                    {
                        l = in.readLine();
                        if( l == null ) break fetch; // end of page, and no low bracket

                        final Matcher m = BRAC_LINE_MARKER_PATTERN.matcher( l );
                        final String position = m.find()? m.group(1): null;
                        if( "Lo".equals( position )) break; // l is first line of low bracket

                        appendFromWiki( l, out ); // transform
                        if( position != null )
                        {
                            assert "Hi".equals( position );
                            // All that was written thus far is high bracket.  Move it
                            // to its own file, and start the main body file afresh.
                            out.close();
                            fileHi = File.createTempFile( prefix + "Hi", "." + id );
                            fileSpool.add( new FileHold( fileHi ));
                            FileX.copyAs( fileHi, file );
                            out = new BufferedWriter( new OutputStreamWriter(
                              new FileOutputStream(file), nativeCharset )); // truncates
                        }
                    }
                    out.close();

                  // Low bracket
                  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                    fileLo = File.createTempFile( prefix + "Lo", "." + id );
                    fileSpool.add( new FileHold( fileLo ));
                    out = new BufferedWriter( new OutputStreamWriter(
                      new FileOutputStream(fileLo), nativeCharset ));
                    for( ;; )
                    {
                        out.write( l ); // untransformed
                        out.newLine();
                        l = in.readLine();
                        if( l == null ) break;
                    }
                }
                finally{ out.close(); }
            }
            finally{ in.close(); }
        }
        finally{ http.disconnect(); }

        return new File[] { fileHi, file, fileLo };
    }



    /** Runs a loopback test of this line transformer.  It transforms a fixed sample text
      * inward by appendFromWiki and back outward by appendToWiki, printing each stage to
      * standard output, and finally printing whether the result equals the original.
      *
      *     @throws IOException if reading or writing the in-memory text fails.
      */
    void test() throws IOException
    {
      // Original
      // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        // NOTE(review): several sample lines below describe content (references, a
        // tagged phrase, a double space, an extra space) that the literals do not
        // contain.  Presumably that markup and spacing was lost from this copy of the
        // file; verify against the upstream source before relying on this fixture.
        final String originalText;
        {
            final StringBuilder sB = new StringBuilder();
            sB.append( "This is line 1, which has a comma.\n" );
            sB.append( "This is line 2 and it has no punctuation.\n" );
            sB.append( "\n" );
            sB.append( "\n" );
            sB.append( "The two preceding lines were empty.\n" );
            sB.append( "This sentence is broken\n" );
            sB.append( "across two lines\n" );
            sB.append( "* This is bulleted.\n" );
            sB.append( ": This is indented.\n" );
            sB.append( ":: This is double indented.\n" );
            sB.append( "A. This is a section title\n" );
            sB.append( "This line ends with a reference.This is a reference.\n" );
            sB.append( "This line ends with a reference link.[This is a reference link.]\n" );
            sB.append( "This line has a reference,This is a reference. inside.\n" );
            sB.append( "This line has a reference link.[This is a reference link.] inside.\n" );
            sB.append( "Here is a tagged phrase in mid-line.\n" );
            sB.append( "This line ends with a space. \n" );
            sB.append( " This begins with a space.\n" );
            sB.append( "This ends with a softbreak." ).append( LineTransformer1.SOFTBREAK_CHAR ).append( '\n' );
            sB.append( "A softbreak " ).append( LineTransformer1.SOFTBREAK_CHAR ).append( " is contained here.\n" );
            sB.append( "Hello. These two sentences are separated by a double space.\n" );
            sB.append( "This,line,has,,.punctuation!but?no.spaces.\n" );
            sB.append( "This comma, has an extra space.\n" );
              // end text with newline, else test will fail per LineTransformer1
            originalText = sB.toString();
        }
        System.out.println( "originalText" );
        System.out.println( "------------" );
        System.out.println( originalText );
        System.out.println( "--- end\n" );

      // Transformed in
      // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        final String transformedInText;
        {
            final BufferedReader in = new BufferedReader( new StringReader( originalText ));
            try
            {
                final StringWriter outS = new StringWriter();
                final BufferedWriter out = new BufferedWriter( outS );
                try
                {
                    for( ;; )
                    {
                        final String l = in.readLine();
                        if( l == null ) break;

                        appendFromWiki( l, out );
                    }
                }
                finally{ out.close(); } // flushes to outS before it is read below

                transformedInText = outS.toString();
                System.out.println( "transformed in" );
                System.out.println( "--------------" );
                System.out.println( transformedInText );
                System.out.println( "--- end\n" );
            }
            finally{ in.close(); }
        }

      // Transformed back out
      // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        final String transformedOutText;
        {
            final BufferedReader in = new BufferedReader( new StringReader( transformedInText ));
            try
            {
                final StringWriter outS = new StringWriter();
                final BufferedWriter out = new BufferedWriter( outS );
                try
                {
                    for( ;; )
                    {
                        final String l = in.readLine();
                        if( l == null ) break;

                        appendToWiki( l, out );
                    }
                }
                finally{ out.close(); } // flushes to outS before it is read below

                transformedOutText = outS.toString();
                System.out.println( "transformed out" );
                System.out.println( "---------------" );
                System.out.println( transformedOutText );
                System.out.println( "--- end\n" );
            }
            finally{ in.close(); }
        }

        System.out.println( "transformedOutText.equals(originalText) = "
          + transformedOutText.equals(originalText) );
    }



//// P r i v a t e ///////////////////////////////////////////////////////////////////////


    /** The character appended to a line to indicate that the line break which follows
      * was inserted by this transformer (U+21A9).
      */
    private static final char SOFTBREAK_CHAR = '\u21a9';


}