package votorola.s.gwt.mediawiki; // Copyright 2011-2012, Michael Allan. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Votorola Software"), to deal in the Votorola Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicence, and/or sell copies of the Votorola Software, and to permit persons to whom the Votorola Software is furnished to do so, subject to the following conditions: The preceding copyright notice and this permission notice shall be included in all copies or substantial portions of the Votorola Software. THE VOTOROLA SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE VOTOROLA SOFTWARE OR THE USE OR OTHER DEALINGS IN THE VOTOROLA SOFTWARE. import com.google.gwt.regexp.shared.*; import votorola.g.lang.*; import votorola.g.web.gwt.*; /** A constructor of regular expression patterns for the purpose of searching for * difference segments in the wikitext of a draft page. */ final class SegmentPatternBuilder { /** Creates a SegmentPatternBuilder. */ SegmentPatternBuilder( DifferenceShadowsV _view, DifferenceParse _p ) { view = _view; p = _p; } // ------------------------------------------------------------------------------------ /** Returns the search pattern for the next segment of the diff commencing at * p.diffText.charAt( p.d ). The pattern includes only those contiguous * lines that start with the specified prefix. Pointer p.d is left at the start of * the next segment, or at p.diffText.length if there is none. * * @param prefix the line prefix of the diff segment. This is either '-' or '+' * for a hunk segment, or space ' ' for a context segment. * @return the pattern, which may be the empty string '' if the current line does * not begin with the specified prefix. * * @throws Warning if a failure is detected. */ String nextPattern( final char prefix ) throws DifferenceShadowsV.Warning { if( view.rep() > 0 ) view.report( p.d + " --- nextDiffPattern '" + prefix + "'", 3, p.diff ); StringBuilderX.clear( b ); final String text = p.diffText; if( p.d != 0 && text.charAt(p.d-1) != '\n' ) { throw new DifferenceShadowsV.Warning( "Expected line start but not found: " + p.positionMessage() ); } boolean toExpectPrefix = true; // and force last to newline for sake of blank line detection: char chPipe = '\n'; // last char processed, exclusive of prefix and artificial newlines int chPipeCount = 1; // repetition count for chPipe segment: for( int dN = text.length(); p.d < dN ; ++p.d ) { char ch = text.charAt( p.d ); // Prefix at start of line // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( toExpectPrefix ) { if( ch != prefix ) break segment; // end of segment toExpectPrefix = false; continue segment; // eat the line prefix } // Piping from previous loops // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - boolean isArtificialNewline = false; // till proven otherwise if( ch != chPipe ) // change of character { // Softbreaks // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( chPipe == SOFTBREAK_CHAR ) { // Artificial newlines, cf. a.diff.LineTransformer1.appendToWiki() // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( ch == '\n' ) { if( IntegerX.isOdd( chPipeCount )) { isArtificialNewline = true; --chPipeCount; // eat softbreak inserted as artificial newline indicator } chPipeCount /= 2; // eat byte stuffing } for( int c = 0; c < chPipeCount; ++c ) b.append( chPipe ); // unpipe } } // Note: if isHangingBracketMystery and isMagicWordish are both coded inline // below (even with no other code) then GWT compiler hangs (2.4.0+mca.2). try { // Hanging bracket mysteries // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( isHangingBracketMystery( ch )) continue segment; // HTML <..> // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( ch == '<' ) // see also isHangingBracketMystery above { int t = p.d; if( text.charAt(++t) == 'r' && text.charAt(++t) == 'e' && text.charAt(++t) == 'f' ) { final char tch = text.charAt( ++t ); // Inline references .. // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( tch == '>' ) { b.append( "\\[\\d+\\]" ); // reference link body for( ;; ) { ++t; final String line = p.subline( t ); WIKI_REF_END_PATTERN.setLastIndex( 0 ); if( WIKI_REF_END_PATTERN.exec(line) != null ) break; t += line.length(); // to the newline, if any ++t; // to the prefix, if any if( t >= dN || text.charAt(t) != prefix ) { p.d = t; break segment; // no closing tag in this segment, done with it } } p.d = t + WIKI_REF_END_PATTERN.getLastIndex() - 1; // last char, i.e. '>' ch = text.charAt( p.d ); continue segment; } // Reference listing // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - else if( tch == 'e' && text.charAt(++t) == 'r' && text.charAt(++t) == 'e' && text.charAt(++t) == 'n' && text.charAt(++t) == 'c' && text.charAt(++t) == 'e' && text.charAt(++t) == 's' && text.charAt(++t) == '/' && text.charAt(++t) == '>' ) { b.append( ".*?" ); // whatever it generates for this p.d = t; ch = '>'; continue segment; } } // - - - ch = skipToClosingBracket( '<', '>', prefix ); continue segment; // eat it } if( ch == '>' ) // unmatched open bracket(s) { skipToLast( ch ); StringBuilderX.clear( b ); // clear the buffer, it was all HTML continue segment; } // Italics ''..'' and bold '''..''' // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( ch == '\'' && skipToLast(ch) > 0 ) continue segment; // eat them // Line formatting // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( chPipe == '\n' ) // start of line { // Lists * | # | : and non-TOC headings ; at start of line // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - int tEnd = p.d; char tch = ch; while( tch == '*' || tch == '#' || tch == ':' || tch == ';' ) { ++tEnd; if( tEnd >= dN ) break; tch = text.charAt( tEnd ); ch = tch; // just to be correct } if( tEnd > p.d ) { p.d = tEnd - 1; // skip to last // if( skipToLast( ' ' )) ch = ' '; // eat any spaces after it //// no, they are rendered continue segment; } // Preformatted text, leading space // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( ch == ' ' ) continue segment; // eat the leading space // see also note in newline } // Links // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( ch == '[' ) { int t = p.d + 1; final char tch = text.charAt( t ); ch = skipToClosingBracket( '[', ']', prefix ); int tEnd = p.d + 1; // Internal link [[..]] // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( tch == '[' ) // two of them { ++t; // start of body if( ch == ']' ) { --tEnd; // before ultimate ] if( text.charAt(p.d-1) == ']' ) --tEnd; // before any second ] } final int t2 = text.indexOf( '|', t ); if( t2 != -1 && t2 < tEnd ) t = t2 + 1; // body specification if( t < tEnd ) RegExpX.appendQuoted( b, text, t, tEnd ); } // External link [..] // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - else { if( ch == ']' ) --tEnd; final int t2 = text.indexOf( ' ', t ); if( t2 != -1 && t2 < tEnd ) { t = t2 + 1; // body specification if( t < tEnd ) RegExpX.appendQuoted( b, text, t, tEnd ); } else b.append( "\\[[0-9]+\\]" ); // no body specified (never tested), collapses to [N] } continue segment; } // see also isHangingBracketMystery above // Newlines // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( ch == '\n' ) { toExpectPrefix = true; if( isArtificialNewline ) ch = 0; // for sake of chPipeCount below, ignore it else if( chPipe != '\n' ) b.append( "\n*" ); // Normal line ending. Allow multiple newlines to accomodate a // single edge case in which MediaWiki injects an extra newline when // it inserts a closing tag. Allow zero newlines because // sometimes MediaWiki removes a newline, e.g. after closing // brackets of a template call. // else blank line, the repetition \n* should cover it continue segment; } // Magic words __..__ and plain underscores // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( isMagicWordish( ch )) continue segment; // Piped characters, for later processing // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( ch == SOFTBREAK_CHAR ) continue segment; // Section headlines =..= | ==..== | ===..=== | ... // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( ch == '=' ) { if( chPipe == '\n' ) // start of line, definitely a headline { b.append( "(?:\\[\\S+\\])?" ); // allow preceding "[edit]" link b.append( " *" ); // a single space always precedes the content } else // maybe trailing headline bracket or just '=' signs, not sure { b.append( "\\=*" ); // allow them } skipToLast( ch ); // take all of them continue segment; } // Template or parser call {{..}} or formal template parameter {{{..}}} // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( ch == '{' ) // assume for now it is actually {{ or {{{ { ch = skipToClosingBracket( '{', '}', prefix ); b.append( ".*?" ); // might be rendered as anything continue segment; } // see also isHangingBracketMystery above, and note about "newlines" // removed in template calls // Default: ordinary, literal characters // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - RegExpX.appendQuoted( b, ch ); } finally { if( ch != 0 ) { if( ch == chPipe ) ++chPipeCount; else { chPipe = ch; chPipeCount = 1; } } } } return b.toString(); } //// P r i v a t e /////////////////////////////////////////////////////////////////////// private final StringBuilder b = new StringBuilder( /*initial capacity*/1000 ); private boolean isHangingBracketMystery( final char ch ) { if( ch == '}' || ch == ']' ) // unmatched open bracket(s) { skipToLast( ch ); StringBuilderX.clear( b ); // clear the buffer, it was all a mystery // b.append( ".*?" ); // might be rendered as anything //// but that would overreach at the head of the pattern [GH] return true; } return false; } private boolean isHTML( final char ch ) { if( ch == '}' || ch == ']' ) // unmatched open bracket(s) { skipToLast( ch ); StringBuilderX.clear( b ); // clear the buffer, it was all a mystery // b.append( ".*?" ); // might be rendered as anything //// but that would overreach at the head of the pattern [GH] return true; } return false; } private boolean isMagicWordish( final char ch ) { if( ch != '_' ) return false; final String text = p.diffText; int t = p.d; if( text.charAt(++t) == '_' ) { int m = t; // start of possible magic word for( ;; ) { ++t; final int tch = text.charAt( t ); if( tch == '_' ) { if( text.charAt(++t) != '_' ) break; // not a magic word if( t - m == 3 + 1 ) // if it's 3 characters long { if( text.charAt(m++) == 'T' && text.charAt(m++) == 'O' && text.charAt(m) == 'C' ) { b.append( ".+?" ); // might be rendered as anything (never tested) } } // else it's a magic word that doesn't render, so just eat it p.d = t; return true; } if( tch < 'A' || tch > 'Z' ) break; // not a magic word } } b.append( ch ); return true; // actually just an underscore, but safely done with it } private final DifferenceParse p; /** Advances cursor p.d to the closing bracket that matches the specified opening * bracket, if possible. Allows for bracket nesting. * * @param prefix the line prefix of the current diff segment. The cursor will * not be advanced onto a newline character unless the next line has the same * prefix. * @return the character advanced to, which might not be a closing bracket. * * @throws AssertionError if the current character is not the specified * openingBracket. */ private char skipToClosingBracket( final char openingBracket, final char closingBracket, final char prefix ) { final int dN = p.diffText.length(); int t = p.d; assert p.diffText.charAt(t) == openingBracket; char ch = openingBracket; for( int bracketCount = 1;; ) // find closing bracket { ++t; if( t >= dN ) { p.d = t - 1; // last char break; } final char tch = p.diffText.charAt( t ); if( tch == '\n' ) { if( p.diffText.charAt(t+1) != prefix ) { p.d = t - 1; // back up prior to the newline break; } } else { ch = tch; if( ch == openingBracket ) ++bracketCount; else if( ch == closingBracket ) { --bracketCount; if( bracketCount == 0 ) { p.d = t; break; } } } } return ch; } /** If p.d + 1 matches the specified character, then cursor p.d is * advanced to the last contiguous character that also matches. * * @return count of characters that match, which might be zero. */ private int skipToLast( final char ch ) { final int dN = p.diffText.length(); int tEnd = p.d + 1; while( tEnd < dN && p.diffText.charAt(tEnd) == ch ) ++tEnd; final int count = tEnd - p.d - 1; if( count > 0 ) p.d = tEnd - 1; // skip to last return count; } private static final char SOFTBREAK_CHAR = '\u21a9'; // per a.diff.LineTransformer1 private final DifferenceShadowsV view; /** The global pattern of a refence end tag in wikitext. Be sure to reset the * {@linkplain RegExp#getLastIndex() last index} before using it. */ private static final RegExp WIKI_REF_END_PATTERN = RegExp.compile( "", "g" ); }