001package votorola.a.diff; // Copyright 2010-2011, Michael Allan.  Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Votorola Software"), to deal in the Votorola Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicence, and/or sell copies of the Votorola Software, and to permit persons to whom the Votorola Software is furnished to do so, subject to the following conditions: The preceding copyright notice and this permission notice shall be included in all copies or substantial portions of the Votorola Software. THE VOTOROLA SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE VOTOROLA SOFTWARE OR THE USE OR OTHER DEALINGS IN THE VOTOROLA SOFTWARE.
002
003import java.io.*;
004import java.net.*;
005import java.nio.charset.*;
006import java.util.regex.*;
007import votorola.g.hold.*;
008import votorola.g.io.*;
009import votorola.g.lang.*;
010import votorola.g.logging.*;
011import votorola.g.net.*;
012
013
014/** A bidirectional transformer of wikitext lines, that breaks incoming wikitext into
015  * multiple lines at punctuation marks.  Lines thus broken are marked by an appended
016  * soft-break character.  We need this because diff/patch tools are line oriented,
017  * depending on newline characters for context; whereas wikitext is typically
018  * paragraph-oriented and has very long lines.
019  *
020  * <h3>Bracket markers to distinguish meta-content from proper content</h3>
021  *
022  * <p>The line transformer recognizes markers that distinguish meta-content from proper
023  * content.  The meta-content must bracket the page like a header and footer.  It is
024  * marked by lines that contain BRAC_LINE_MARKER_PATTERN.  These lines and the content
025  * above or below are ignored for differencing purposes.  Pages are thus structured as
026  * follows:</p>
027  *
028  * <pre>
029  *     The optional high bracket, or header, ends with a line
030  *     containing this marker &lt;!--voHiBrac--&gt;
031  *
032  *     The content proper for differencing purposes
033  *
034  *     This &lt;!--voLoBrac--&gt; marks the first line
035  *     of the optional low bracket, or footer.</pre>
036  *
037  * <p>Note that this transformer may append a final newline to outgoing wikitext that was
038  * not in the original.  This is unlikely to have much effect, because MediaWiki's own
039  * editor has the same behaviour.</p>
040  *
041  *     @see <a href='http://reluk.ca/w/Category:Draft#Bracket_markers_to_distinguish_meta-content_from_proper_content'
042  *       >Category:Draft Bracket markers</a>
043  */
044public final @ThreadSafe class LineTransformer1
045{
046
047    // In addition to high and low brackets, we might have allowed floating spans of
048    // ignored text in the middle.  But it would be tricky.  The incoming appendFromWiki
049    // can reduce the spans to empty stubs with locally unique identifiers, such that the
050    // outgoing appendToWiki can replace them with the correct original content.  The
051    // stubs would have to be paired between the two files, however, such that equivalent
052    // stubs are given equal identifiers that don't show up as differences in the result.
053    // But how to establish that equivalence?  Maybe leave it, till there's a use case.
054
055
056    /** Appends a line of incoming text from the wiki.
057      */
058    void appendFromWiki( final String line, final Writer out ) throws IOException
059    {
060        final int cLast = line.length() - 1;
061        if( cLast >= 0 )
062        {
063            int trailingSoftbreakCount = 0; // in original
064            int letterCount = 0;
065            char ch = 'a'; // anything unimportant
066            for( int c = 0;; ++c )
067            {
068                boolean isLast = c == cLast;
069                final char chPrior = ch;
070                ch = line.charAt( c );
071                boolean toSoftbreak = false; // thus far
072                if( ch == SOFTBREAK_CHAR ) ++trailingSoftbreakCount;
073                else
074                {
075                    trailingSoftbreakCount = 0; // reset counter
076                    if( Character.isLetter( ch )) ++letterCount;
077
078                    softbreak:
079                    {
080                        if( isLast ) break softbreak; // not at very end, where it breaks anyway
081
082                        if( letterCount < 3 ) break softbreak;
083                          // Not on "1. Title" or ":: indented text".  Rather wait for
084                          // a few letters to appear.  But not too large a count, or a
085                          // small insertion might (in some cases) effect a cascade
086                          // across multiple lines.
087
088                        if( ch == ' ' ) // on space, depending on adjacent chars
089                        {
090                            assert !isLast;
091                            char chNext = line.charAt( c + 1 );
092                            if( chNext == '<' || chNext == '[' ) toSoftbreak = true; // start tag or or start of link
093                            else if( chPrior == '>' || chPrior == ']' // end tag or end of link
094                             || chPrior == '-'
095                             || chPrior == '\u2014' // em dash
096                             || chPrior != '/' // don't break "[http://etc/path/ link]"
097                               && Character.getType(chPrior) == Character.OTHER_PUNCTUATION )
098                                  // .,;:?! and the like, per http://www.fileformat.info/info/unicode/category/Po/list.htm
099                            {
100                                while( chNext == ' ' ) // gobble contiguous spaces
101                                {
102                                    out.write( ch );
103                                    ++c;
104                                    isLast = c == cLast;
105                                    if( isLast ) break;
106
107                                    chNext = line.charAt( c + 1 );
108                                }
109
110                                toSoftbreak = !isLast;
111                            }
112                        }
113                        else if( Character.getType(ch) == Character.OTHER_PUNCTUATION )
114                        {
115                            assert !isLast;
116                            final int cNext = c + 1;
117                            final char chNext = line.charAt( cNext );
118                            if( chNext == '[' ) toSoftbreak = true; // "end of phrase,[ref link]"
119                            else if( chNext == '<' && cNext < cLast )
120                            {
121                                final char chNextNext = line.charAt( cNext + 1 );
122                                toSoftbreak = chNextNext != '/';
123                                  // so "end of phrase,<ref>", but not "end of ref.</ref>"
124                            }
125                        }
126                    }
127                }
128
129                out.write( ch );
130                if( isLast ) for( int s = 0; s < trailingSoftbreakCount; ++s )
131                {
132                    out.write( SOFTBREAK_CHAR ); // byte-stuff
133                }
134
135                if( toSoftbreak )
136                {
137                    out.write( SOFTBREAK_CHAR ); // indicator
138                    out.write( '\n' );          // actual break
139                }
140
141                if( isLast ) break;
142            }
143        }
144        out.write( '\n' );
145    }
146
147
148
149    /** Appends a line of outgoing text, destined for the wiki.  This undoes the
150      * incoming transformation.
151      */
152    public void appendToWiki( final String line, final Writer out ) throws IOException
153    {
154        // cf. votorola.s.gwt.mediawiki.SegmentPatternBuilder
155
156        int lineLength = line.length();
157        int trailingSoftbreakCount = 0;
158        for( int c = lineLength - 1; c >= 0; --c )
159        {
160            if( line.charAt(c) != SOFTBREAK_CHAR ) break;
161
162            ++trailingSoftbreakCount;
163        }
164
165        final boolean toBreak;
166        if( IntegerX.isOdd( trailingSoftbreakCount )) // then we inserted a softbreak
167        {
168            --lineLength; // eat the indicator
169            toBreak = false; // eat the break
170        }
171        else toBreak = true;
172
173        if( trailingSoftbreakCount > 1 ) lineLength -= trailingSoftbreakCount / 2; // un-byte-stuff
174
175        out.write( line, 0, lineLength );
176        if( toBreak ) out.write( '\n' );
177    }
178
179
180
181    /** The pattern of a "low bracket" or "high bracket" marker in a line of wikitext.
182      * Group (1) contains the bracket type.  If it equals "Hi", then it marks the last
183      * line of the "high bracket".  The high bracket (basically a header) includes all
184      * lines from the start of the page to this line.  If it equals "Lo", then it marks
185      * the first line of the "low bracket".  The low bracket (a footer) includes all
186      * lines from this one to the end of the page.  All lines of the high and low
187      * brackets are ignored for differencing purposes.
188      */
189    static final Pattern BRAC_LINE_MARKER_PATTERN = Pattern.compile(
190      "<!--.*\\bvo(Hi|Lo)Brac\\b.*-->" );
191
192
193
194    /** Downloads the wikitext source of the specified page, and splits it into one or
195      * more temporary files.
196      *
197      *     @param idType one of "curid" or "oldid".
198      *     @param id the page identifier (curid) or revision identifier (oldid).
199      *     @param s the base URI for script execution in the wiki, without a trailing
200      *       slash (/).
201      *     @param prefix the base {@linkplain File#createTempFile(String,String) prefix}
202      *       for the temporary files.
203      *     @param fileSpool the spool to hold temporary files.  If unwound, the files
204      *       will be deleted.
205      *
206      *     @return an array of up to three files: [0] a file containing the transformed
207      *       content of the high bracket up to and including BRAC_LINE_MARKER_PATTERN
208      *       (Hi), or null if there is none; [1] a file containing the transformed main
209      *       body of the source text; and [2] a file containing the untransformed low
210      *       bracket, up to and including BRAC_LINE_MARKER_PATTERN (Lo), or null if there
211      *       is none.  Note that only the low bracket file is untransformed.
212      *
213      *     @see <a href='http://www.mediawiki.org/wiki/Manual:Parameters_to_index.php'
214      *                         >www.mediawiki.org/wiki/Manual:Parameters_to_index.php</a>
215      */
216    public File[] fetchPageAsFile( final URI s, final String idType, final int id,
217      final String prefix, final Spool fileSpool ) throws IOException
218    {
219        final HttpURLConnection http;
220        try
221        {
222            final URI uri = new URI( s + "/index.php?action=raw&" + idType + "=" + id );
223            LoggerX.i(LineTransformer1.class).fine( "querying wiki " + uri );
224            http = (HttpURLConnection)( uri.toURL().openConnection() );
225        }
226        catch( URISyntaxException x ) { throw new RuntimeException( x ); }
227
228        final File file = File.createTempFile( prefix, "." + id );
229        fileSpool.add( new FileHold( file ));
230        File fileHi = null;
231        File fileLo = null;
232        URLConnectionX.connect( http );
233        try
234        {
235            final BufferedReader in = new BufferedReader( new InputStreamReader(
236              http.getInputStream(), "UTF-8" )); // assuming UTF-8, maybe FIX by reading the HTTP header
237            try
238            {
239                final Charset nativeCharset = Charset.defaultCharset();
240                BufferedWriter out = new BufferedWriter( new OutputStreamWriter(
241                  new FileOutputStream(file), nativeCharset ));
242                fetch: try
243                {
244                    String l;
245
246                  // High bracket and main body
247                  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
248                    for( ;; )
249                    {
250                        l = in.readLine();
251                        if( l == null ) break fetch;
252
253                        final Matcher m = BRAC_LINE_MARKER_PATTERN.matcher( l );
254                        final String position = m.find()? m.group(1): null;
255                        if( "Lo".equals( position )) break;
256
257                        appendFromWiki( l, out ); // transform
258                        if( position != null )
259                        {
260                            assert "Hi".equals( position );
261                            out.close();
262
263                            fileHi = File.createTempFile( prefix + "Hi", "." + id );
264                            fileSpool.add( new FileHold( fileHi ));
265                            FileX.copyAs( fileHi, file );
266                            out = new BufferedWriter( new OutputStreamWriter(
267                              new FileOutputStream(file), nativeCharset ));
268                        }
269                    }
270                    out.close();
271
272                  // Low bracket
273                  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
274                    fileLo = File.createTempFile( prefix + "Lo", "." + id );
275                    fileSpool.add( new FileHold( fileLo ));
276                    out = new BufferedWriter( new OutputStreamWriter(
277                      new FileOutputStream(fileLo), nativeCharset ));
278                    for( ;; )
279                    {
280                        out.write( l );
281                        out.newLine();
282
283                        l = in.readLine();
284                        if( l == null ) break;
285                    }
286                }
287                finally{ out.close(); }
288            }
289            finally{ in.close(); }
290        }
291        finally{ http.disconnect(); }
292
293        return new File[] { fileHi, file, fileLo };
294    }
295
296
297
298    /** Runs a loopback test of this line transformer.
299      */
300    void test() throws IOException
301    {
302      // Original
303      // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
304        final String originalText;
305        {
306            final StringBuilder sB = new StringBuilder();
307            sB.append( "This is line 1, which has a comma.\n" );
308            sB.append( "This is line 2 and it has no punctuation.\n" );
309            sB.append( "\n" );
310            sB.append( "\n" );
311            sB.append( "The two preceding lines were empty.\n" );
312            sB.append( "This sentence is broken\n" );
313            sB.append( "across two lines\n" );
314            sB.append( "* This is bulleted.\n" );
315            sB.append( ": This is indented.\n" );
316            sB.append( ":: This is double indented.\n" );
317            sB.append( "A. This is a section title\n" );
318            sB.append( "This line ends with a reference.<ref>This is a reference.</ref>\n" );
319            sB.append( "This line ends with a reference link.[This is a reference link.]\n" );
320            sB.append( "This line has a reference,<ref>This is a reference.</ref> inside.\n" );
321            sB.append( "This line has a reference link.[This is a reference link.] inside.\n" );
322            sB.append( "Here is a <em>tagged phrase</em> in mid-line.\n" );
323            sB.append( "This line ends with a space. \n" );
324            sB.append( " This begins with a space.\n" );
325            sB.append( "This ends with a softbreak." ).append(
326              LineTransformer1.SOFTBREAK_CHAR ).append( '\n' );
327            sB.append( "A softbreak " ).append( LineTransformer1.SOFTBREAK_CHAR ).append(
328              " is contained here.\n" );
329            sB.append( "Hello.  These two sentences are separated by a double space.\n" );
330            sB.append( "This,line,has,,.punctuation!but?no.spaces.\n" );
331            sB.append( "This comma,  has an extra space.\n" ); // end text with newline, else test will fail per LineTransformer1
332            originalText = sB.toString();
333        }
334        System.out.println( "originalText" );
335        System.out.println( "------------" );
336        System.out.println( originalText );
337        System.out.println( "--- end\n" );
338
339      // Transformed in
340      // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
341        final String transformedInText;
342        {
343            final BufferedReader in = new BufferedReader( new StringReader( originalText ));
344            try
345            {
346                final StringWriter outS = new StringWriter();
347                final BufferedWriter out = new BufferedWriter( outS );
348                try
349                {
350                    for( ;; )
351                    {
352                        final String l = in.readLine();
353                        if( l == null ) break;
354
355                        appendFromWiki( l, out );
356                    }
357                }
358                finally{ out.close(); }
359
360                transformedInText = outS.toString();
361                System.out.println( "transformed in" );
362                System.out.println( "--------------" );
363                System.out.println( transformedInText );
364                System.out.println( "--- end\n" );
365            }
366            finally{ in.close(); }
367        }
368
369      // Transformed back out
370      // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
371        final String transformedOutText;
372        {
373            final BufferedReader in = new BufferedReader( new StringReader( transformedInText ));
374            try
375            {
376                final StringWriter outS = new StringWriter();
377                final BufferedWriter out = new BufferedWriter( outS );
378                try
379                {
380                    for( ;; )
381                    {
382                        final String l = in.readLine();
383                        if( l == null ) break;
384
385                        appendToWiki( l, out );
386                    }
387                }
388                finally{ out.close(); }
389
390                transformedOutText = outS.toString();
391                System.out.println( "transformed out" );
392                System.out.println( "---------------" );
393                System.out.println( transformedOutText );
394                System.out.println( "--- end\n" );
395            }
396            finally{ in.close(); }
397        }
398
399        System.out.println( "transformedOutText.equals(originalText) = "
400                           + transformedOutText.equals(originalText) );
401    }
402
403
404
405//// P r i v a t e ///////////////////////////////////////////////////////////////////////
406
407
408    private static final char SOFTBREAK_CHAR = '\u21a9';
409
410
411
412}