Source code

001package votorola.s.gwt.mediawiki; // Copyright 2011-2012, Michael Allan.  Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Votorola Software"), to deal in the Votorola Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicence, and/or sell copies of the Votorola Software, and to permit persons to whom the Votorola Software is furnished to do so, subject to the following conditions: The preceding copyright notice and this permission notice shall be included in all copies or substantial portions of the Votorola Software. THE VOTOROLA SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE VOTOROLA SOFTWARE OR THE USE OR OTHER DEALINGS IN THE VOTOROLA SOFTWARE.
002
003import com.google.gwt.regexp.shared.*;
004import votorola.g.lang.*;
005import votorola.g.web.gwt.*;
006
007
008/** A constructor of regular expression patterns for the purpose of searching for
009  * difference segments in the wikitext of a draft page.
010  */
011final class SegmentPatternBuilder
012{
013
014
015    /** Creates a SegmentPatternBuilder.
016      */
017    SegmentPatternBuilder( DifferenceShadowsV _view, DifferenceParse _p )
018    {
019        view = _view;
020        p = _p;
021    }
022
023
024
025   // ------------------------------------------------------------------------------------
026
027
028    /** Returns the search pattern for the next segment of the diff commencing at
029      * <code>p.diffText.charAt( p.d )</code>.  The pattern includes only those contiguous
030      * lines that start with the specified prefix.  Pointer p.d is left at the start of
031      * the next segment, or at p.diffText.length if there is none.
032      *
033      *     @param prefix the line prefix of the diff segment.  This is either '-' or '+'
034      *       for a hunk segment, or space ' ' for a context segment.
035      *     @return the pattern, which may be the empty string '' if the current line does
036      *       not begin with the specified prefix.
037      *
038      *     @throws Warning if a failure is detected.
039      */
040    String nextPattern( final char prefix ) throws DifferenceShadowsV.Warning
041    {
042        if( view.rep() > 0 ) view.report( p.d + " --- nextDiffPattern '" + prefix + "'", 3, p.diff );
043        StringBuilderX.clear( b );
044        final String text = p.diffText;
045        if( p.d != 0 && text.charAt(p.d-1) != '\n' )
046        {
047            throw new DifferenceShadowsV.Warning( "Expected line start but not found: "
048              + p.positionMessage() );
049        }
050
051        boolean toExpectPrefix = true; // and force last to newline for sake of blank line detection:
052        char chPipe = '\n'; // last char processed, exclusive of prefix and artificial newlines
053        int chPipeCount = 1; // repetition count for chPipe
054        segment: for( int dN = text.length(); p.d < dN ; ++p.d )
055        {
056            char ch = text.charAt( p.d );
057
058          // Prefix at start of line
059          // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
060            if( toExpectPrefix )
061            {
062                if( ch != prefix ) break segment; // end of segment
063
064                toExpectPrefix = false;
065                continue segment; // eat the line prefix
066            }
067
068          // Piping from previous loops
069          // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
070            boolean isArtificialNewline = false; // till proven otherwise
071            if( ch != chPipe ) // change of character
072            {
073              // Softbreaks
074              // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
075                if( chPipe == SOFTBREAK_CHAR )
076                {
077                  // Artificial newlines, cf. a.diff.LineTransformer1.appendToWiki()
078                  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
079                    if( ch == '\n' )
080                    {
081                        if( IntegerX.isOdd( chPipeCount ))
082                        {
083                            isArtificialNewline = true;
084                            --chPipeCount; // eat softbreak inserted as artificial newline indicator
085                        }
086                        chPipeCount /= 2; // eat byte stuffing
087                    }
088                    for( int c = 0; c < chPipeCount; ++c ) b.append( chPipe ); // unpipe
089                }
090            }
091
092            // Note: if isHangingBracketMystery and isMagicWordish are both coded inline
093            // below (even with no other code) then GWT compiler hangs (2.4.0+mca.2).
094            try
095            {
096              // Hanging bracket mysteries
097              // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
098                if( isHangingBracketMystery( ch )) continue segment;
099
100              // HTML <..>
101              // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
102                if( ch == '<' ) // see also isHangingBracketMystery above
103                {
104                    int t = p.d;
105                    if( text.charAt(++t) == 'r'
106                     && text.charAt(++t) == 'e'
107                     && text.charAt(++t) == 'f' )
108                    {
109                        final char tch = text.charAt( ++t );
110
111                      // Inline references <ref>..</ref>
112                      // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
113                        if(          tch == '>' )
114                        {
115                            b.append( "\\[\\d+\\]" ); // reference link body
116                            for( ;; )
117                            {
118                                ++t;
119                                final String line = p.subline( t );
120                                WIKI_REF_END_PATTERN.setLastIndex( 0 );
121                                if( WIKI_REF_END_PATTERN.exec(line) != null ) break;
122
123                                t += line.length(); // to the newline, if any
124                                ++t; // to the prefix, if any
125                                if( t >= dN || text.charAt(t) != prefix )
126                                {
127                                    p.d = t;
128                                    break segment; // no closing tag in this segment, done with it
129                                }
130                            }
131
132                            p.d = t + WIKI_REF_END_PATTERN.getLastIndex() - 1; // last char, i.e. '>'
133                            ch = text.charAt( p.d );
134                            continue segment;
135                        }
136
137                      // Reference listing <references/>
138                      // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
139                        else if(         tch == 'e'
140                         && text.charAt(++t) == 'r'
141                         && text.charAt(++t) == 'e'
142                         && text.charAt(++t) == 'n'
143                         && text.charAt(++t) == 'c'
144                         && text.charAt(++t) == 'e'
145                         && text.charAt(++t) == 's'
146                         && text.charAt(++t) == '/'
147                         && text.charAt(++t) == '>' )
148                        {
149                            b.append( ".*?" ); // whatever it generates for this
150                            p.d = t;
151                            ch = '>';
152                            continue segment;
153                        }
154                    }
155
156                  // - - -
157                    ch = skipToClosingBracket( '<', '>', prefix );
158                    continue segment; // eat it
159                }
160
161                if( ch == '>' ) // unmatched open bracket(s)
162                {
163                    skipToLast( ch );
164                    StringBuilderX.clear( b ); // clear the buffer, it was all HTML
165                    continue segment;
166                }
167
168              // Italics ''..'' and bold '''..'''
169              // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
170                if( ch == '\'' && skipToLast(ch) > 0 ) continue segment; // eat them
171
172              // Line formatting
173              // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
174                if( chPipe == '\n' ) // start of line
175                {
176                  // Lists * | # | : and non-TOC headings ; at start of line
177                  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
178                    int tEnd = p.d;
179                    char tch = ch;
180                    while( tch == '*' || tch == '#' || tch == ':' || tch == ';' )
181                    {
182                        ++tEnd;
183                        if( tEnd >= dN ) break;
184
185                        tch = text.charAt( tEnd );
186                        ch = tch; // just to be correct
187                    }
188                    if( tEnd > p.d )
189                    {
190                        p.d = tEnd - 1; // skip to last
191                     // if( skipToLast( ' ' )) ch = ' '; // eat any spaces after it
192                     //// no, they are rendered
193                        continue segment;
194                    }
195
196                  // Preformatted text, leading space
197                  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
198                    if( ch == ' ' ) continue segment; // eat the leading space
199                    // see also </pre> note in newline
200                }
201
202              // Links
203              // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
204                if( ch == '[' )
205                {
206                    int t = p.d + 1;
207                    final char tch = text.charAt( t );
208                    ch = skipToClosingBracket( '[', ']', prefix );
209                    int tEnd = p.d + 1;
210
211                  // Internal link [[..]]
212                  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
213                    if( tch == '[' ) // two of them
214                    {
215                        ++t; // start of body
216                        if( ch == ']' )
217                        {
218                            --tEnd; // before ultimate ]
219                            if( text.charAt(p.d-1) == ']' ) --tEnd; // before any second ]
220                        }
221                        final int t2 = text.indexOf( '|', t );
222                        if( t2 != -1 && t2 < tEnd ) t = t2 + 1; // body specification
223                        if( t < tEnd ) RegExpX.appendQuoted( b, text, t, tEnd );
224                    }
225
226                  // External link [..]
227                  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
228                    else
229                    {
230                        if( ch == ']' ) --tEnd;
231                        final int t2 = text.indexOf( ' ', t );
232                        if( t2 != -1 && t2 < tEnd )
233                        {
234                            t = t2 + 1; // body specification
235                            if( t < tEnd ) RegExpX.appendQuoted( b, text, t, tEnd );
236                        }
237                        else b.append( "\\[[0-9]+\\]" );
238                          // no body specified (never tested), collapses to [N]
239                    }
240                    continue segment;
241                }
242                // see also isHangingBracketMystery above
243
244              // Newlines
245              // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
246                if( ch == '\n' )
247                {
248                    toExpectPrefix = true;
249                    if( isArtificialNewline ) ch = 0; // for sake of chPipeCount below, ignore it
250                    else if( chPipe != '\n' ) b.append( "\n*" );
251                      // Normal line ending.  Allow multiple newlines to accomodate a
252                      // single edge case in which MediaWiki injects an extra newline when
253                      // it inserts a </pre> closing tag.  Allow zero newlines because
254                      // sometimes MediaWiki removes a newline, e.g. after closing
255                      // brackets of a template call.
256                 // else blank line, the repetition \n* should cover it
257                    continue segment;
258                }
259
260              // Magic words __..__ and plain underscores
261              // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
262                if( isMagicWordish( ch )) continue segment;
263
264              // Piped characters, for later processing
265              // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
266                if( ch == SOFTBREAK_CHAR ) continue segment;
267
268              // Section headlines =..= | ==..== | ===..=== | ...
269              // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
270                if( ch == '=' )
271                {
272                    if( chPipe == '\n' ) // start of line, definitely a headline
273                    {
274                        b.append( "(?:\\[\\S+\\])?" ); // allow preceding "[edit]" link
275                        b.append( " *" ); // a single space always precedes the content
276                    }
277                    else // maybe trailing headline bracket or just '=' signs, not sure
278                    {
279                        b.append( "\\=*" ); // allow them
280                    }
281
282                    skipToLast( ch ); // take all of them
283                    continue segment;
284                }
285
286              // Template or parser call {{..}} or formal template parameter {{{..}}}
287              // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
288                if( ch == '{' ) // assume for now it is actually {{ or {{{
289                {
290                    ch = skipToClosingBracket( '{', '}', prefix );
291                    b.append( ".*?" ); // might be rendered as anything
292                    continue segment;
293                }
294                // see also isHangingBracketMystery above, and note about "newlines"
295                // removed in template calls
296
297              // Default: ordinary, literal characters
298              // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
299                RegExpX.appendQuoted( b, ch );
300            }
301            finally
302            {
303                if( ch != 0 )
304                {
305                    if( ch == chPipe ) ++chPipeCount;
306                    else
307                    {
308                        chPipe = ch;
309                        chPipeCount = 1;
310                    }
311                }
312            }
313        }
314        return b.toString();
315    }
316
317
318
319//// P r i v a t e ///////////////////////////////////////////////////////////////////////
320
321
    /** The buffer in which nextPattern assembles its pattern.  It is cleared at the
      * start of each call to nextPattern, and again by the helpers whenever the
      * content gathered so far turns out to be unusable.
      */
    private final StringBuilder b = new StringBuilder( /*initial capacity*/1000 );
323
324
325
326    private boolean isHangingBracketMystery( final char ch )
327    {
328        if( ch == '}' || ch == ']' ) // unmatched open bracket(s)
329        {
330            skipToLast( ch );
331            StringBuilderX.clear( b ); // clear the buffer, it was all a mystery
332         // b.append( ".*?" ); // might be rendered as anything
333         //// but that would overreach at the head of the pattern [GH]
334            return true;
335        }
336
337        return false;
338    }
339
340
341
342    private boolean isHTML( final char ch )
343    {
344        if( ch == '}' || ch == ']' ) // unmatched open bracket(s)
345        {
346            skipToLast( ch );
347            StringBuilderX.clear( b ); // clear the buffer, it was all a mystery
348         // b.append( ".*?" ); // might be rendered as anything
349         //// but that would overreach at the head of the pattern [GH]
350            return true;
351        }
352
353        return false;
354    }
355
356
357
358    private boolean isMagicWordish( final char ch )
359    {
360        if( ch != '_' ) return false;
361
362        final String text = p.diffText;
363        int t = p.d;
364        if( text.charAt(++t) == '_' )
365        {
366            int m = t; // start of possible magic word
367            for( ;; )
368            {
369                ++t;
370                final int tch = text.charAt( t );
371                if( tch == '_' )
372                {
373                    if( text.charAt(++t) != '_' ) break; // not a magic word
374
375                    if( t - m == 3 + 1 ) // if it's 3 characters long
376                    {
377                        if( text.charAt(m++) == 'T'
378                         && text.charAt(m++) == 'O'
379                         && text.charAt(m) ==   'C' )
380                        {
381                            b.append( ".+?" );
382                              // might be rendered as anything (never tested)
383                        }
384                    }
385                    // else it's a magic word that doesn't render, so just eat it
386                    p.d = t;
387                    return true;
388                }
389
390                if( tch < 'A' || tch > 'Z' ) break; // not a magic word
391            }
392        }
393        b.append( ch );
394        return true; // actually just an underscore, but safely done with it
395    }
396
397
398
    /** The parse on which this builder works.  It supplies the diff text
      * (p.diffText) and the cursor into it (p.d), which the methods of this class
      * advance as they consume the text.
      */
    private final DifferenceParse p;
400
401
402
403    /** Advances cursor p.d to the closing bracket that matches the specified opening
404      * bracket, if possible.  Allows for bracket nesting.
405      *
406      *     @param prefix the line prefix of the current diff segment.  The cursor will
407      *       not be advanced onto a newline character unless the next line has the same
408      *       prefix.
409      *     @return the character advanced to, which might not be a closing bracket.
410      *
411      *     @throws AssertionError if the current character is not the specified
412      *       openingBracket.
413      */
414    private char skipToClosingBracket( final char openingBracket, final char closingBracket,
415      final char prefix )
416    {
417        final int dN = p.diffText.length();
418        int t = p.d;
419        assert p.diffText.charAt(t) == openingBracket;
420        char ch = openingBracket;
421        for( int bracketCount = 1;; ) // find closing bracket
422        {
423            ++t;
424            if( t >= dN )
425            {
426                p.d = t - 1; // last char
427                break;
428            }
429
430            final char tch = p.diffText.charAt( t );
431            if( tch == '\n' )
432            {
433                if( p.diffText.charAt(t+1) != prefix )
434                {
435                    p.d = t - 1; // back up prior to the newline
436                    break;
437                }
438            }
439            else
440            {
441                ch = tch;
442                if( ch == openingBracket ) ++bracketCount;
443                else if( ch == closingBracket )
444                {
445                    --bracketCount;
446                    if( bracketCount == 0 )
447                    {
448                        p.d = t;
449                        break;
450                    }
451                }
452            }
453        }
454        return ch;
455    }
456
457
458
459    /** If <code>p.d + 1</code> matches the specified character, then cursor p.d is
460      * advanced to the last contiguous character that also matches.
461      *
462      *     @return count of characters that match, which might be zero.
463      */
464    private int skipToLast( final char ch )
465    {
466        final int dN = p.diffText.length();
467        int tEnd = p.d + 1;
468        while( tEnd < dN && p.diffText.charAt(tEnd) == ch ) ++tEnd;
469        final int count = tEnd - p.d - 1;
470        if( count > 0 ) p.d = tEnd - 1; // skip to last
471        return count;
472    }
473
474
475
    /** The softbreak marker character, U+21A9.  In nextPattern, runs of this
      * character are held back until the character that follows them is known.  When
      * that character is a newline, an odd run length marks the newline as artificial
      * and the rest of the run is halved as byte stuffing.
      */
    private static final char SOFTBREAK_CHAR = '\u21a9'; // per a.diff.LineTransformer1
477
478
479
    /** The view to which this builder sends its progress reports.
      */
    private final DifferenceShadowsV view;
481
482
    /** The global pattern of a reference end tag in wikitext.  Be sure to reset the
      * {@linkplain RegExp#getLastIndex() last index} before using it.
      */
    private static final RegExp WIKI_REF_END_PATTERN = RegExp.compile( "</ref>", "g" );
487
488
489
490}