001package votorola.s.gwt.mediawiki; // Copyright 2011-2012, Michael Allan. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Votorola Software"), to deal in the Votorola Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicence, and/or sell copies of the Votorola Software, and to permit persons to whom the Votorola Software is furnished to do so, subject to the following conditions: The preceding copyright notice and this permission notice shall be included in all copies or substantial portions of the Votorola Software. THE VOTOROLA SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE VOTOROLA SOFTWARE OR THE USE OR OTHER DEALINGS IN THE VOTOROLA SOFTWARE. 002 003import com.google.gwt.regexp.shared.*; 004import votorola.g.lang.*; 005import votorola.g.web.gwt.*; 006 007 008/** A constructor of regular expression patterns for the purpose of searching for 009 * difference segments in the wikitext of a draft page. 010 */ 011final class SegmentPatternBuilder 012{ 013 014 015 /** Creates a SegmentPatternBuilder. 016 */ 017 SegmentPatternBuilder( DifferenceShadowsV _view, DifferenceParse _p ) 018 { 019 view = _view; 020 p = _p; 021 } 022 023 024 025 // ------------------------------------------------------------------------------------ 026 027 028 /** Returns the search pattern for the next segment of the diff commencing at 029 * <code>p.diffText.charAt( p.d )</code>. The pattern includes only those contiguous 030 * lines that start with the specified prefix. Pointer p.d is left at the start of 031 * the next segment, or at p.diffText.length if there is none. 032 * 033 * @param prefix the line prefix of the diff segment. This is either '-' or '+' 034 * for a hunk segment, or space ' ' for a context segment. 035 * @return the pattern, which may be the empty string '' if the current line does 036 * not begin with the specified prefix. 037 * 038 * @throws Warning if a failure is detected. 039 */ 040 String nextPattern( final char prefix ) throws DifferenceShadowsV.Warning 041 { 042 if( view.rep() > 0 ) view.report( p.d + " --- nextDiffPattern '" + prefix + "'", 3, p.diff ); 043 StringBuilderX.clear( b ); 044 final String text = p.diffText; 045 if( p.d != 0 && text.charAt(p.d-1) != '\n' ) 046 { 047 throw new DifferenceShadowsV.Warning( "Expected line start but not found: " 048 + p.positionMessage() ); 049 } 050 051 boolean toExpectPrefix = true; // and force last to newline for sake of blank line detection: 052 char chPipe = '\n'; // last char processed, exclusive of prefix and artificial newlines 053 int chPipeCount = 1; // repetition count for chPipe 054 segment: for( int dN = text.length(); p.d < dN ; ++p.d ) 055 { 056 char ch = text.charAt( p.d ); 057 058 // Prefix at start of line 059 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 060 if( toExpectPrefix ) 061 { 062 if( ch != prefix ) break segment; // end of segment 063 064 toExpectPrefix = false; 065 continue segment; // eat the line prefix 066 } 067 068 // Piping from previous loops 069 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 070 boolean isArtificialNewline = false; // till proven otherwise 071 if( ch != chPipe ) // change of character 072 { 073 // Softbreaks 074 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 075 if( chPipe == SOFTBREAK_CHAR ) 076 { 077 // Artificial newlines, cf. a.diff.LineTransformer1.appendToWiki() 078 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 079 if( ch == '\n' ) 080 { 081 if( IntegerX.isOdd( chPipeCount )) 082 { 083 isArtificialNewline = true; 084 --chPipeCount; // eat softbreak inserted as artificial newline indicator 085 } 086 chPipeCount /= 2; // eat byte stuffing 087 } 088 for( int c = 0; c < chPipeCount; ++c ) b.append( chPipe ); // unpipe 089 } 090 } 091 092 // Note: if isHangingBracketMystery and isMagicWordish are both coded inline 093 // below (even with no other code) then GWT compiler hangs (2.4.0+mca.2). 094 try 095 { 096 // Hanging bracket mysteries 097 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 098 if( isHangingBracketMystery( ch )) continue segment; 099 100 // HTML <..> 101 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 102 if( ch == '<' ) // see also isHangingBracketMystery above 103 { 104 int t = p.d; 105 if( text.charAt(++t) == 'r' 106 && text.charAt(++t) == 'e' 107 && text.charAt(++t) == 'f' ) 108 { 109 final char tch = text.charAt( ++t ); 110 111 // Inline references <ref>..</ref> 112 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 113 if( tch == '>' ) 114 { 115 b.append( "\\[\\d+\\]" ); // reference link body 116 for( ;; ) 117 { 118 ++t; 119 final String line = p.subline( t ); 120 WIKI_REF_END_PATTERN.setLastIndex( 0 ); 121 if( WIKI_REF_END_PATTERN.exec(line) != null ) break; 122 123 t += line.length(); // to the newline, if any 124 ++t; // to the prefix, if any 125 if( t >= dN || text.charAt(t) != prefix ) 126 { 127 p.d = t; 128 break segment; // no closing tag in this segment, done with it 129 } 130 } 131 132 p.d = t + WIKI_REF_END_PATTERN.getLastIndex() - 1; // last char, i.e. '>' 133 ch = text.charAt( p.d ); 134 continue segment; 135 } 136 137 // Reference listing <references/> 138 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 139 else if( tch == 'e' 140 && text.charAt(++t) == 'r' 141 && text.charAt(++t) == 'e' 142 && text.charAt(++t) == 'n' 143 && text.charAt(++t) == 'c' 144 && text.charAt(++t) == 'e' 145 && text.charAt(++t) == 's' 146 && text.charAt(++t) == '/' 147 && text.charAt(++t) == '>' ) 148 { 149 b.append( ".*?" ); // whatever it generates for this 150 p.d = t; 151 ch = '>'; 152 continue segment; 153 } 154 } 155 156 // - - - 157 ch = skipToClosingBracket( '<', '>', prefix ); 158 continue segment; // eat it 159 } 160 161 if( ch == '>' ) // unmatched open bracket(s) 162 { 163 skipToLast( ch ); 164 StringBuilderX.clear( b ); // clear the buffer, it was all HTML 165 continue segment; 166 } 167 168 // Italics ''..'' and bold '''..''' 169 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 170 if( ch == '\'' && skipToLast(ch) > 0 ) continue segment; // eat them 171 172 // Line formatting 173 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 174 if( chPipe == '\n' ) // start of line 175 { 176 // Lists * | # | : and non-TOC headings ; at start of line 177 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 178 int tEnd = p.d; 179 char tch = ch; 180 while( tch == '*' || tch == '#' || tch == ':' || tch == ';' ) 181 { 182 ++tEnd; 183 if( tEnd >= dN ) break; 184 185 tch = text.charAt( tEnd ); 186 ch = tch; // just to be correct 187 } 188 if( tEnd > p.d ) 189 { 190 p.d = tEnd - 1; // skip to last 191 // if( skipToLast( ' ' )) ch = ' '; // eat any spaces after it 192 //// no, they are rendered 193 continue segment; 194 } 195 196 // Preformatted text, leading space 197 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 198 if( ch == ' ' ) continue segment; // eat the leading space 199 // see also </pre> note in newline 200 } 201 202 // Links 203 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 204 if( ch == '[' ) 205 { 206 int t = p.d + 1; 207 final char tch = text.charAt( t ); 208 ch = skipToClosingBracket( '[', ']', prefix ); 209 int tEnd = p.d + 1; 210 211 // Internal link [[..]] 212 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 213 if( tch == '[' ) // two of them 214 { 215 ++t; // start of body 216 if( ch == ']' ) 217 { 218 --tEnd; // before ultimate ] 219 if( text.charAt(p.d-1) == ']' ) --tEnd; // before any second ] 220 } 221 final int t2 = text.indexOf( '|', t ); 222 if( t2 != -1 && t2 < tEnd ) t = t2 + 1; // body specification 223 if( t < tEnd ) RegExpX.appendQuoted( b, text, t, tEnd ); 224 } 225 226 // External link [..] 227 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 228 else 229 { 230 if( ch == ']' ) --tEnd; 231 final int t2 = text.indexOf( ' ', t ); 232 if( t2 != -1 && t2 < tEnd ) 233 { 234 t = t2 + 1; // body specification 235 if( t < tEnd ) RegExpX.appendQuoted( b, text, t, tEnd ); 236 } 237 else b.append( "\\[[0-9]+\\]" ); 238 // no body specified (never tested), collapses to [N] 239 } 240 continue segment; 241 } 242 // see also isHangingBracketMystery above 243 244 // Newlines 245 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 246 if( ch == '\n' ) 247 { 248 toExpectPrefix = true; 249 if( isArtificialNewline ) ch = 0; // for sake of chPipeCount below, ignore it 250 else if( chPipe != '\n' ) b.append( "\n*" ); 251 // Normal line ending. Allow multiple newlines to accomodate a 252 // single edge case in which MediaWiki injects an extra newline when 253 // it inserts a </pre> closing tag. Allow zero newlines because 254 // sometimes MediaWiki removes a newline, e.g. after closing 255 // brackets of a template call. 256 // else blank line, the repetition \n* should cover it 257 continue segment; 258 } 259 260 // Magic words __..__ and plain underscores 261 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 262 if( isMagicWordish( ch )) continue segment; 263 264 // Piped characters, for later processing 265 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 266 if( ch == SOFTBREAK_CHAR ) continue segment; 267 268 // Section headlines =..= | ==..== | ===..=== | ... 269 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 270 if( ch == '=' ) 271 { 272 if( chPipe == '\n' ) // start of line, definitely a headline 273 { 274 b.append( "(?:\\[\\S+\\])?" ); // allow preceding "[edit]" link 275 b.append( " *" ); // a single space always precedes the content 276 } 277 else // maybe trailing headline bracket or just '=' signs, not sure 278 { 279 b.append( "\\=*" ); // allow them 280 } 281 282 skipToLast( ch ); // take all of them 283 continue segment; 284 } 285 286 // Template or parser call {{..}} or formal template parameter {{{..}}} 287 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 288 if( ch == '{' ) // assume for now it is actually {{ or {{{ 289 { 290 ch = skipToClosingBracket( '{', '}', prefix ); 291 b.append( ".*?" ); // might be rendered as anything 292 continue segment; 293 } 294 // see also isHangingBracketMystery above, and note about "newlines" 295 // removed in template calls 296 297 // Default: ordinary, literal characters 298 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 299 RegExpX.appendQuoted( b, ch ); 300 } 301 finally 302 { 303 if( ch != 0 ) 304 { 305 if( ch == chPipe ) ++chPipeCount; 306 else 307 { 308 chPipe = ch; 309 chPipeCount = 1; 310 } 311 } 312 } 313 } 314 return b.toString(); 315 } 316 317 318 319//// P r i v a t e /////////////////////////////////////////////////////////////////////// 320 321 322 private final StringBuilder b = new StringBuilder( /*initial capacity*/1000 ); 323 324 325 326 private boolean isHangingBracketMystery( final char ch ) 327 { 328 if( ch == '}' || ch == ']' ) // unmatched open bracket(s) 329 { 330 skipToLast( ch ); 331 StringBuilderX.clear( b ); // clear the buffer, it was all a mystery 332 // b.append( ".*?" ); // might be rendered as anything 333 //// but that would overreach at the head of the pattern [GH] 334 return true; 335 } 336 337 return false; 338 } 339 340 341 342 private boolean isHTML( final char ch ) 343 { 344 if( ch == '}' || ch == ']' ) // unmatched open bracket(s) 345 { 346 skipToLast( ch ); 347 StringBuilderX.clear( b ); // clear the buffer, it was all a mystery 348 // b.append( ".*?" ); // might be rendered as anything 349 //// but that would overreach at the head of the pattern [GH] 350 return true; 351 } 352 353 return false; 354 } 355 356 357 358 private boolean isMagicWordish( final char ch ) 359 { 360 if( ch != '_' ) return false; 361 362 final String text = p.diffText; 363 int t = p.d; 364 if( text.charAt(++t) == '_' ) 365 { 366 int m = t; // start of possible magic word 367 for( ;; ) 368 { 369 ++t; 370 final int tch = text.charAt( t ); 371 if( tch == '_' ) 372 { 373 if( text.charAt(++t) != '_' ) break; // not a magic word 374 375 if( t - m == 3 + 1 ) // if it's 3 characters long 376 { 377 if( text.charAt(m++) == 'T' 378 && text.charAt(m++) == 'O' 379 && text.charAt(m) == 'C' ) 380 { 381 b.append( ".+?" ); 382 // might be rendered as anything (never tested) 383 } 384 } 385 // else it's a magic word that doesn't render, so just eat it 386 p.d = t; 387 return true; 388 } 389 390 if( tch < 'A' || tch > 'Z' ) break; // not a magic word 391 } 392 } 393 b.append( ch ); 394 return true; // actually just an underscore, but safely done with it 395 } 396 397 398 399 private final DifferenceParse p; 400 401 402 403 /** Advances cursor p.d to the closing bracket that matches the specified opening 404 * bracket, if possible. Allows for bracket nesting. 405 * 406 * @param prefix the line prefix of the current diff segment. The cursor will 407 * not be advanced onto a newline character unless the next line has the same 408 * prefix. 409 * @return the character advanced to, which might not be a closing bracket. 410 * 411 * @throws AssertionError if the current character is not the specified 412 * openingBracket. 413 */ 414 private char skipToClosingBracket( final char openingBracket, final char closingBracket, 415 final char prefix ) 416 { 417 final int dN = p.diffText.length(); 418 int t = p.d; 419 assert p.diffText.charAt(t) == openingBracket; 420 char ch = openingBracket; 421 for( int bracketCount = 1;; ) // find closing bracket 422 { 423 ++t; 424 if( t >= dN ) 425 { 426 p.d = t - 1; // last char 427 break; 428 } 429 430 final char tch = p.diffText.charAt( t ); 431 if( tch == '\n' ) 432 { 433 if( p.diffText.charAt(t+1) != prefix ) 434 { 435 p.d = t - 1; // back up prior to the newline 436 break; 437 } 438 } 439 else 440 { 441 ch = tch; 442 if( ch == openingBracket ) ++bracketCount; 443 else if( ch == closingBracket ) 444 { 445 --bracketCount; 446 if( bracketCount == 0 ) 447 { 448 p.d = t; 449 break; 450 } 451 } 452 } 453 } 454 return ch; 455 } 456 457 458 459 /** If <code>p.d + 1</code> matches the specified character, then cursor p.d is 460 * advanced to the last contiguous character that also matches. 461 * 462 * @return count of characters that match, which might be zero. 463 */ 464 private int skipToLast( final char ch ) 465 { 466 final int dN = p.diffText.length(); 467 int tEnd = p.d + 1; 468 while( tEnd < dN && p.diffText.charAt(tEnd) == ch ) ++tEnd; 469 final int count = tEnd - p.d - 1; 470 if( count > 0 ) p.d = tEnd - 1; // skip to last 471 return count; 472 } 473 474 475 476 private static final char SOFTBREAK_CHAR = '\u21a9'; // per a.diff.LineTransformer1 477 478 479 480 private final DifferenceShadowsV view; 481 482 483 /** The global pattern of a refence end tag in wikitext. Be sure to reset the 484 * {@linkplain RegExp#getLastIndex() last index} before using it. 485 */ 486 private static final RegExp WIKI_REF_END_PATTERN = RegExp.compile( "</ref>", "g" ); 487 488 489 490}