package votorola.a.diff; // Copyright 2010-2011, Michael Allan.  Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Votorola Software"), to deal in the Votorola Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicence, and/or sell copies of the Votorola Software, and to permit persons to whom the Votorola Software is furnished to do so, subject to the following conditions: The preceding copyright notice and this permission notice shall be included in all copies or substantial portions of the Votorola Software. THE VOTOROLA SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE VOTOROLA SOFTWARE OR THE USE OR OTHER DEALINGS IN THE VOTOROLA SOFTWARE.

import java.io.*;
import java.net.*;
import java.nio.charset.*;
import java.util.regex.*;
import votorola.g.hold.*;
import votorola.g.io.*;
import votorola.g.lang.*;
import votorola.g.logging.*;
import votorola.g.net.*;


/** A bidirectional transformer of wikitext lines, that breaks incoming wikitext into
  * multiple lines at punctuation marks.  Lines thus broken are marked by an appended
  * soft-break character.  We need this because diff/patch tools are line oriented,
  * depending on newline characters for context; whereas wikitext is typically
  * paragraph-oriented and has very long lines.
  *
  * <h3>Bracket markers to distinguish meta-content from proper content</h3>
  *
  * <p>The line transformer recognizes markers that distinguish meta-content from proper
  * content.  The meta-content must bracket the page like a header and footer.  It is
  * marked by lines that contain BRAC_LINE_MARKER_PATTERN.  These lines and the content
  * above or below are ignored for differencing purposes.  Pages are thus structured as
  * follows:</p>
  *
  * <pre>
  *     The optional high bracket, or header, ends with a line
  *     containing this marker &lt;!--voHiBrac--&gt;
  *
  *     The content proper for differencing purposes
  *
  *     This &lt;!--voLoBrac--&gt; marks the first line
  *     of the optional low bracket, or footer.</pre>
  *
  * <p>Note that this transformer may append a final newline to outgoing wikitext that was
  * not in the original.  This is unlikely to have much effect, because MediaWiki's own
  * editor has the same behaviour.</p>
  *
  *     @see <a href='http://reluk.ca/w/Category:Draft#Bracket_markers_to_distinguish_meta-content_from_proper_content'
  *       >Category:Draft Bracket markers</a>
  */
public final @ThreadSafe class LineTransformer1
{

    // In addition to high and low brackets, we might have allowed floating spans of
    // ignored text in middle.  But it would be tricky.  The incoming appendFromWiki can
    // reduce the spans to empty stubs with locally unique identifiers, such that the
    // outgoing appendToWiki can replace them with the correct original content.  The
    // stubs would have to be paired between the two files, however, such that equivalent
    // stubs are given equal identifiers that don't show up as differences in the result.
    // But how to establish that equivalence?  Maybe leave it, till there's a use case.


    /** Appends a line of incoming text from the wiki.  The line is written to the
      * output, possibly broken into several shorter lines.  Each inserted break is
      * written as a soft-break character followed by a newline.  A final newline is
      * always written, even for an empty line.
      *
      * <p>Any run of soft-break characters that ends the original line is written
      * twice over (byte-stuffed).  A transformed line therefore ends with an odd count
      * of soft-break characters only if a break was inserted there, which is what
      * {@linkplain #appendToWiki(String,Writer) appendToWiki} tests for.</p>
      *
      *     @param line the line of wikitext, without its line terminator.
      *     @param out the writer to which the transformed text is appended.
      */
    void appendFromWiki( final String line, final Writer out ) throws IOException
    {
        final int cLast = line.length() - 1;
        if( cLast >= 0 )
        {
            int trailingSoftbreakCount = 0; // in original
            int letterCount = 0;
            char ch = 'a'; // anything unimportant
            for( int c = 0;; ++c )
            {
                boolean isLast = c == cLast;
                final char chPrior = ch;
                ch = line.charAt( c );
                boolean toSoftbreak = false; // thus far
                if( ch == SOFTBREAK_CHAR ) ++trailingSoftbreakCount;
                else
                {
                    trailingSoftbreakCount = 0; // reset counter
                    if( Character.isLetter( ch )) ++letterCount;

                  softbreak:
                    {
                        if( isLast ) break softbreak; // not at very end, where it breaks anyway

                        if( letterCount < 3 ) break softbreak;
                          // Not on "1. Title" or ":: indented text".  Rather wait for
                          // a few letters to appear.  But not too large a count, or a
                          // small insertion might (in some cases) effect a cascade
                          // across multiple lines.

                        if( ch == ' ' ) // on space, depending on adjacent chars
                        {
                            assert !isLast;
                            char chNext = line.charAt( c + 1 );
                            if( chNext == '<' || chNext == '[' ) toSoftbreak = true; // start tag or start of link
                            else if( chPrior == '>' || chPrior == ']' // end tag or end of link
                              || chPrior == '-'
                              || chPrior == '\u2014' // em dash
                              || chPrior != '/' // don't break "[http://etc/path/ link]"
                                && Character.getType(chPrior) == Character.OTHER_PUNCTUATION )
                                  // .,;:?! and the like, per http://www.fileformat.info/info/unicode/category/Po/list.htm
                            {
                                while( chNext == ' ' ) // gobble contiguous spaces
                                {
                                    out.write( ch ); // ch is a space; the last of the run is written below
                                    ++c;
                                    isLast = c == cLast;
                                    if( isLast ) break;

                                    chNext = line.charAt( c + 1 );
                                }

                                toSoftbreak = !isLast; // no indicator if the spaces ran to the end of the line
                            }
                        }
                        else if( Character.getType(ch) == Character.OTHER_PUNCTUATION )
                        {
                            assert !isLast;
                            final int cNext = c + 1;
                            final char chNext = line.charAt( cNext );
                            if( chNext == '[' ) toSoftbreak = true; // "end of phrase,[ref link]"
                            else if( chNext == '<' && cNext < cLast )
                            {
                                final char chNextNext = line.charAt( cNext + 1 );
                                toSoftbreak = chNextNext != '/';
                                  // so "end of phrase,<ref>", but not "end of ref.</ref>"
                            }
                        }
                    }
                }

                out.write( ch );
                if( isLast ) for( int s = 0; s < trailingSoftbreakCount; ++s )
                {
                    out.write( SOFTBREAK_CHAR ); // byte-stuff
                }

                if( toSoftbreak )
                {
                    out.write( SOFTBREAK_CHAR ); // indicator
                    out.write( '\n' ); // actual break
                }

                if( isLast ) break;
            }
        }
        out.write( '\n' );
    }



    /** Appends a line of outgoing text, destined for the wiki.  This undoes the
      * incoming transformation.  An odd count of trailing soft-break characters means
      * that a break was inserted after this line, so the indicator is removed and no
      * newline is written.  Otherwise a newline is written.  In both cases the
      * remaining trailing soft-break characters are halved in number, which restores
      * the run that was in the original.
      *
      *     @param line a line of transformed text, without its line terminator.
      *     @param out the writer to which the restored wikitext is appended.
      */
    public void appendToWiki( final String line, final Writer out ) throws IOException
    {
        // cf. votorola.s.gwt.mediawiki.SegmentPatternBuilder

        int lineLength = line.length();
        int trailingSoftbreakCount = 0;
        for( int c = lineLength - 1; c >= 0; --c )
        {
            if( line.charAt(c) != SOFTBREAK_CHAR ) break;

            ++trailingSoftbreakCount;
        }

        final boolean toBreak;
        if( IntegerX.isOdd( trailingSoftbreakCount )) // then we inserted a softbreak
        {
            --lineLength; // eat the indicator
            toBreak = false; // eat the break
        }
        else toBreak = true;

        if( trailingSoftbreakCount > 1 ) lineLength -= trailingSoftbreakCount / 2; // un-byte-stuff

        out.write( line, 0, lineLength );
        if( toBreak ) out.write( '\n' );
    }



    /** The pattern of a "low bracket" or "high bracket" marker in a line of wikitext.
      * Group (1) contains the bracket type.  If it equals "Hi", then it marks the last
      * line of the "high bracket".  The high bracket (basically a header) includes all
      * lines from the start of the page to this line.  If it equals "Lo", then it marks
      * the first line of the "low bracket".  The low bracket (a footer) includes all
      * lines from this one to the end of the page.  All lines of the high and low
      * brackets are ignored for differencing purposes.
      */
    static final Pattern BRAC_LINE_MARKER_PATTERN = Pattern.compile(
      "<!--.*\\bvo(Hi|Lo)Brac\\b.*-->" );



    /** Downloads the wikitext source of the specified page, and splits it into one or
      * more temporary files.  The response is decoded using the charset declared in
      * its Content-Type header, or UTF-8 if none is declared or the declared name is
      * unusable.  The files are written in the default charset of the platform.
      *
      *     @param s the base URI for script execution in the wiki, without a trailing
      *       slash (/).
      *     @param idType one of "curid" or "oldid".
      *     @param id the page identifier (curid) or revision identifier (oldid).
      *     @param prefix the base {@linkplain File#createTempFile(String,String) prefix}
      *       for the temporary files.
      *     @param fileSpool the spool to hold temporary files.  If unwound, the files
      *       will be deleted.
      *
      *     @return an array of three elements: [0] a file containing the transformed
      *       content of the high bracket up to and including the line of
      *       BRAC_LINE_MARKER_PATTERN (Hi), or null if there is none; [1] a file
      *       containing the transformed main body of the source text; and [2] a file
      *       containing the untransformed low bracket, from and including the line of
      *       BRAC_LINE_MARKER_PATTERN (Lo) to the end of the page, or null if there is
      *       none.  Note that only the low bracket file is untransformed.
      *
      *     @throws IOException if the download fails, or a file cannot be written.
      *
      *     @see <a href='http://www.mediawiki.org/wiki/Manual:Parameters_to_index.php'
      *                    >www.mediawiki.org/wiki/Manual:Parameters_to_index.php</a>
      */
    public File[] fetchPageAsFile( final URI s, final String idType, final int id,
      final String prefix, final Spool fileSpool ) throws IOException
    {
        final HttpURLConnection http;
        try
        {
            final URI uri = new URI( s + "/index.php?action=raw&" + idType + "=" + id );
            LoggerX.i(LineTransformer1.class).fine( "querying wiki " + uri );
            http = (HttpURLConnection)( uri.toURL().openConnection() );
        }
        catch( URISyntaxException x ) { throw new RuntimeException( x ); }

        final File file = File.createTempFile( prefix, "." + id );
        fileSpool.add( new FileHold( file ));
        File fileHi = null;
        File fileLo = null;
        URLConnectionX.connect( http );
        try
        {
            final InputStream inStream = http.getInputStream();
              // obtained before the header is read, so that any failure of the
              // request is thrown from here, as it always was
            final BufferedReader in = new BufferedReader( new InputStreamReader(
              inStream, responseCharset( http )));
            try
            {
                final Charset nativeCharset = Charset.defaultCharset();
                BufferedWriter out = new BufferedWriter( new OutputStreamWriter(
                  new FileOutputStream(file), nativeCharset ));
                fetch: try
                {
                    String l;

                  // High bracket and main body
                  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                    for( ;; )
                    {
                        l = in.readLine();
                        if( l == null ) break fetch; // end of page, with no low bracket

                        final Matcher m = BRAC_LINE_MARKER_PATTERN.matcher( l );
                        final String position = m.find()? m.group(1): null;
                        if( "Lo".equals( position )) break;

                        appendFromWiki( l, out ); // transform
                        if( position != null )
                        {
                            assert "Hi".equals( position );
                            out.close();

                            // All that was written thus far is the high bracket.  Move
                            // it to a file of its own, and start the main body afresh.
                            fileHi = File.createTempFile( prefix + "Hi", "." + id );
                            fileSpool.add( new FileHold( fileHi ));
                            FileX.copyAs( fileHi, file ); // presumably copies file to fileHi, TODO confirm against FileX
                            out = new BufferedWriter( new OutputStreamWriter(
                              new FileOutputStream(file), nativeCharset )); // truncates the main file
                        }
                    }
                    out.close();

                  // Low bracket
                  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                    fileLo = File.createTempFile( prefix + "Lo", "." + id );
                    fileSpool.add( new FileHold( fileLo ));
                    out = new BufferedWriter( new OutputStreamWriter(
                      new FileOutputStream(fileLo), nativeCharset ));
                    for( ;; ) // starting with the marker line itself, which is already read
                    {
                        out.write( l );
                        out.newLine();

                        l = in.readLine();
                        if( l == null ) break;
                    }
                }
                finally{ out.close(); } // harmless if already closed
            }
            finally{ in.close(); }
        }
        finally{ http.disconnect(); }

        return new File[] { fileHi, file, fileLo };
    }



    /** Runs a loopback test of this line transformer.  It transforms a sample text in
      * by appendFromWiki, then back out by appendToWiki, printing each stage to the
      * standard output, together with whether the final text equals the original.
      */
    void test() throws IOException
    {
      // Original
      // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        final String originalText;
        {
            final StringBuilder sB = new StringBuilder();
            sB.append( "This is line 1, which has a comma.\n" );
            sB.append( "This is line 2 and it has no punctuation.\n" );
            sB.append( "\n" );
            sB.append( "\n" );
            sB.append( "The two preceding lines were empty.\n" );
            sB.append( "This sentence is broken\n" );
            sB.append( "across two lines\n" );
            sB.append( "* This is bulleted.\n" );
            sB.append( ": This is indented.\n" );
            sB.append( ":: This is double indented.\n" );
            sB.append( "A. This is a section title\n" );
            sB.append( "This line ends with a reference.<ref>This is a reference.</ref>\n" );
            sB.append( "This line ends with a reference link.[This is a reference link.]\n" );
            sB.append( "This line has a reference,<ref>This is a reference.</ref> inside.\n" );
            sB.append( "This line has a reference link.[This is a reference link.] inside.\n" );
            sB.append( "Here is a <em>tagged phrase</em> in mid-line.\n" );
            sB.append( "This line ends with a space. \n" );
            sB.append( " This begins with a space.\n" );
            sB.append( "This ends with a softbreak." ).append(
              LineTransformer1.SOFTBREAK_CHAR ).append( '\n' );
            sB.append( "A softbreak " ).append( LineTransformer1.SOFTBREAK_CHAR ).append(
              " is contained here.\n" );
            sB.append( "Hello.  These two sentences are separated by a double space.\n" );
              // two spaces, in order to exercise the gobbling of contiguous spaces
            sB.append( "This,line,has,,.punctuation!but?no.spaces.\n" );
            sB.append( "This comma,  has an extra space.\n" ); // end text with newline, else test will fail per LineTransformer1
            originalText = sB.toString();
        }
        System.out.println( "originalText" );
        System.out.println( "------------" );
        System.out.println( originalText );
        System.out.println( "--- end\n" );

      // Transformed in
      // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        final String transformedInText;
        {
            final BufferedReader in = new BufferedReader( new StringReader( originalText ));
            try
            {
                final StringWriter outS = new StringWriter();
                final BufferedWriter out = new BufferedWriter( outS );
                try
                {
                    for( ;; )
                    {
                        final String l = in.readLine();
                        if( l == null ) break;

                        appendFromWiki( l, out );
                    }
                }
                finally{ out.close(); } // flushes to outS

                transformedInText = outS.toString();
                System.out.println( "transformed in" );
                System.out.println( "--------------" );
                System.out.println( transformedInText );
                System.out.println( "--- end\n" );
            }
            finally{ in.close(); }
        }

      // Transformed back out
      // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        final String transformedOutText;
        {
            final BufferedReader in = new BufferedReader( new StringReader( transformedInText ));
            try
            {
                final StringWriter outS = new StringWriter();
                final BufferedWriter out = new BufferedWriter( outS );
                try
                {
                    for( ;; )
                    {
                        final String l = in.readLine();
                        if( l == null ) break;

                        appendToWiki( l, out );
                    }
                }
                finally{ out.close(); } // flushes to outS

                transformedOutText = outS.toString();
                System.out.println( "transformed out" );
                System.out.println( "---------------" );
                System.out.println( transformedOutText );
                System.out.println( "--- end\n" );
            }
            finally{ in.close(); }
        }

        System.out.println( "transformedOutText.equals(originalText) = "
          + transformedOutText.equals(originalText) );
    }



//// P r i v a t e ///////////////////////////////////////////////////////////////////////


    /** The pattern of a charset parameter in the value of a Content-Type header.
      * Group (1) contains the name of the charset, without any quotes.
      */
    private static final Pattern CHARSET_PARAMETER_PATTERN = Pattern.compile(
      "(?i)\\bcharset\\s*=\\s*[\"']?([^\\s;\"']+)" );



    /** The character that is appended to a line in order to mark an inserted break.
      */
    private static final char SOFTBREAK_CHAR = '\u21a9';



    /** Returns the charset that is declared by the Content-Type header of the
      * response, or UTF-8 if none is declared, or the declared name is illegal or
      * unsupported on this platform.
      *
      *     @param connection a connection whose response headers may be read.
      */
    private static Charset responseCharset( final URLConnection connection )
    {
        final String contentType = connection.getContentType(); // null if unknown
        if( contentType != null )
        {
            final Matcher m = CHARSET_PARAMETER_PATTERN.matcher( contentType );
            if( m.find() )
            {
                try{ return Charset.forName( m.group( 1 )); }
                catch( IllegalArgumentException x ) // illegal or unsupported charset name
                {
                    LoggerX.i(LineTransformer1.class).fine(
                      "unusable charset in response header, assuming UTF-8: " + contentType );
                }
            }
        }
        return Charset.forName( "UTF-8" );
    }



}