package votorola.a; // Copyright 2010-2013, Michael Allan. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Votorola Software"), to deal in the Votorola Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicence, and/or sell copies of the Votorola Software, and to permit persons to whom the Votorola Software is furnished to do so, subject to the following conditions: The preceding copyright notice and this permission notice shall be included in all copies or substantial portions of the Votorola Software. THE VOTOROLA SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE VOTOROLA SOFTWARE OR THE USE OR OTHER DEALINGS IN THE VOTOROLA SOFTWARE.

import com.hp.hpl.jena.query.*;
import com.hp.hpl.jena.rdf.model.*;
import java.io.*;
import java.net.*;
import java.text.*;
import java.util.*;
import java.util.regex.*;
import javax.ws.rs.core.UriBuilder;
import javax.xml.stream.*;
import votorola.a.*;
import votorola.g.*;
import votorola.g.hold.*;
import votorola.g.io.*;
import votorola.g.lang.*;
import votorola.g.logging.*;
import votorola.g.net.*;
import votorola.g.text.*;


/** A directory of "semantic" data cached from the pollwiki, ~/votorola/in/wiki. Cache
 * files are created with broad permissions and may be overwritten by any runtime owner
 * (vote-server account, servlet container and others). The administrator may safely
 * delete the contained files at runtime without causing unrecoverable errors, but should
 * not delete the directory itself until after shutting down all runtime processes.
 *
 * @see ../s/manual.xht#wiki
 */
public @ThreadSafe final class WikiCache extends File
{

    /** Constructs a WikiCache.
     */
    WikiCache( VoteServer _voteServer ) throws IOException
    {
        super( _voteServer.inDirectory(), "wiki" );
        voteServer = _voteServer;
        if( !exists() ) init_create();
    }


    private final Churn init_create() throws IOException
    {
        if( !mkdir() ) throw new IOException( "unable to create directory: " + WikiCache.this );

        setWritable( true, /*ownerOnly*/false );
        return churn( /*fromScratch*/true );
    }


    // ------------------------------------------------------------------------------------


    /** Replaces any stale pages in the cache with fresh copies from the wiki. This
     * should be called periodically, e.g. via one of the administrative commands (such
     * as votrace) that has a --churn option. However it should not be called too often
     * as it has the side effect of clearing the poll cache. You may instead want to
     * query the pollwiki directly; Semantic MediaWiki 1.7 introduces a MediaWiki API
     * extension that might be more convenient than the RDF interface.
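     *
     * <p>Such a direct query might look roughly like the following. This is a hedged
     * sketch only: the {@code ask} module and its parameters are assumed from Semantic
     * MediaWiki 1.7, are shown unencoded for readability, and are not used by this
     * class.</p>
     * <pre>
     *     api.php?action=ask&amp;query=[[User:Example]]|?Some_property&amp;format=json
     * </pre>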
     *
     * <p>Note: only changes to page content are detected. A property change owing to a
     * change in the content of a template will not be detected. This is a BUG and the
     * current workaround is to manually delete the page in the cache.</p>
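     *
     * <p>As an alternative to deleting the cached file by hand, a single affected page
     * can be refreshed through {@linkplain #readRDF_JSON(String,boolean) readRDF_JSON}
     * with toChurn set to true, which bypasses the cache and overwrites the stale copy.
     * A minimal sketch, assuming a wikiCache reference is in scope:</p>
     * <pre>
     *     wikiCache.readRDF_JSON( "User:Example", true ); // toChurn
     * </pre>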
     *
     * @return this wiki cache.
     *
     * @see #lastChurnTime()
     * @see votorola.a.count.PollService.VoteServerScope.Run#ensurePoll(String)
     */
    public WikiCache churn() throws IOException
    {
        churn( /*fromScratch*/false );
        return WikiCache.this;
    }


    /** Returns the time of the last churn based on the timestamp of the churn's serial
     * file.
     *
     * @return time in milliseconds since the Epoch, or 0L if unknown.
     */
    public long lastChurnTime() { return churnSerialFile.lastModified(); }


    /** Opens an input stream for the RDF of the specified wiki page in JSON format,
     * UTF-8 character encoding. Attempts first to use the local cache, falling back to
     * the wiki if necessary and caching the result for future calls.
     *
     * @param fullPageName the full name of the page, including any namespace.
     */
    public FileInputStream openRDF_JSON( final String fullPageName ) throws IOException
    {
        return openRDF_JSON( fullPageName, /*toChurn*/false );
    }


    /** Opens an input stream for the RDF of the specified wiki page in JSON format,
     * UTF-8 character encoding. Attempts first to use the local cache, falling back to
     * the wiki if necessary and caching the result for future calls.
     *
     * @param fullPageName the full name of the page, including any namespace.
     * @param toChurn whether or not to bypass the cache and fetch the RDF straight from
     *     the wiki. If true, any previously cached file will be ignored and (if the
     *     fetch succeeds) overwritten.
     */
    public FileInputStream openRDF_JSON( final String fullPageName, final boolean toChurn )
      throws IOException
    {
        final long msChurnBefore = toChurn? Long.MAX_VALUE: Long.MIN_VALUE;
        retry: for( int retryCount = 0;; ++retryCount )
        {
            final File cacheFile = ensure( fullPageName, msChurnBefore, /*allowNewFile*/true );
            try
            {
                return new FileInputStream( cacheFile );
            }
            catch( final FileNotFoundException x )
            {
                if( cacheFile.exists() || retryCount > 0 ) throw x;

                // Else retry. The cacheFile was there earlier (ensured above), so it was
                // probably deleted by the administrator. A single retry should suffice.
            }
        }
    }


    /** Fetches the RDF of the specified wiki page in JSON format. Attempts first to use
     * the local cache, falling back to the wiki if necessary and caching the result for
     * future calls.
     *
     * @param fullPageName the full name of the page, including any namespace.
     */
    public String readRDF_JSON( final String fullPageName ) throws IOException
    {
        return readRDF_JSON( fullPageName, /*toChurn*/false );
    }


    /** Fetches the RDF of the specified wiki page in JSON format. Attempts first to use
     * the local cache, falling back to the wiki if necessary and caching the result for
     * future calls.
     *
     * @param fullPageName the full name of the page, including any namespace.
     * @param toChurn whether or not to bypass the cache and fetch the RDF straight from
     *     the wiki. If true, any previously cached file will be ignored and (if the
     *     fetch succeeds) overwritten.
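     *     For instance, an illustrative sketch in which the wikiCache reference and the
     *     page name are assumed for the example:
     *     <pre>
     *         String json = wikiCache.readRDF_JSON( "User:Example" ); // cached copy if available
     *         json = wikiCache.readRDF_JSON( "User:Example", true );  // forced refresh from the wiki
     *     </pre>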
     */
    public String readRDF_JSON( final String fullPageName, final boolean toChurn )
      throws IOException
    {
        final BufferedReader in = new BufferedReader( new InputStreamReader(
          openRDF_JSON(fullPageName,toChurn), "UTF-8" ));
        try
        {
            return ReaderX.appendTo( new StringBuilder(), in ).toString();
        }
        finally{ in.close(); }
    }


   //// P r i v a t e ///////////////////////////////////////////////////////////////////////


    private final Churn churn( final boolean fromScratch ) throws IOException
    {
        final String rclimit;
        final String rcendQueryComponent; // specifying the earliest to churn
        if( fromScratch )
        {
            rclimit = "1"; // just enough to set tsLatestChurned (below)
            rcendQueryComponent = ""; // no need, fetching just one
        }
        else
        {
            rclimit = "500"; // per query, max for ordinary user
            Churn lastChurn = Churn.readObject( WikiCache.this );
            if( lastChurn == null )
            {
                LoggerX.i(getClass()).config( "lost churn history, cleaning out entire cache" );
                if( !FileX.deleteRecursive( WikiCache.this )) throw new IOException(
                  "unable to delete directory (please delete it manually): " + WikiCache.this );

                return init_create();
            }

            rcendQueryComponent = "&rcend=" + lastChurn.tsLatestChurned; // oldest, listed new to old
              // Churning at the last timestamp of the previous churn. If any pages at
              // that timestamp are actually present in the cache, they will be churned
              // again, perhaps redundantly. This is needed to ensure no change slips
              // through in a burst of multiple changes that all have the same timestamp.
        }
        long msChurnBefore = Long.MAX_VALUE;
          // reset below, will prevent repeat churning of multiply changed/listed pages
        String tsLatestChurned = null; // so far
        queryChain: for( String queryContinuation = "";; )
        {
            // Vetting is not yet implemented in churns. No revisions are excluded, so
            // the server is unshielded from abusive edits. We might implement vetting
            // based on a cooling-off period (P). This would ignore recent revisions
            // (now - P), allowing time for abuse to be detected and corrected. This
            // would require that each query overlap the previous by at least P, in
            // order that any rejected revisions were again reconsidered for churning.
            // Bypass of P might be allowed for sysop changes, user changes in user
            // pages, leader changes in polls, and so forth. Deliberate churns (single
            // page reconstructions by user request) would have to abide by
            // complementary rules. All of this would be somewhat restricted by Semantic
            // MediaWiki's limitations. RDF export applies only to the current page
            // revision, so only that one can ever be accepted for churning. Even if an
            // earlier revision had cooled for P, if the current had not, then neither
            // could be accepted for churning.
            final Spool spool = new Spool1();
            try
            {
                final HttpURLConnection http;
                try
                {
                    // http://www.mediawiki.org/wiki/API:Query_-_Lists#recentchanges_.2F_rc
                    final URI s = voteServer.pollwiki().scriptURI();
                    final URI queryURI = new URI( s.getScheme(), s.getAuthority(),
                      s.getPath() + "/api.php", /*query*/"action=query&list=recentchanges"
                        + rcendQueryComponent + "&rclimit=" + rclimit
                        + "&rcprop=title|timestamp&rctype=edit&format=xml"
                        + queryContinuation, /*fragment*/null );
                    LoggerX.i(getClass()).fine( "querying wiki " + queryURI );
                    http = (HttpURLConnection)( queryURI.toURL().openConnection() );
                }
                catch( URISyntaxException x ) { throw new RuntimeException( x ); }

                URLConnectionX.connect( http );
                spool.add( new Hold()
                {
                    public void release() { http.disconnect(); }
                });
                final InputStream in = http.getInputStream();
                spool.add( new Hold()
                {
                    public void release()
                    {
                        try{ in.close(); }
                        catch( Exception x ) { throw VotorolaRuntimeException.castOrWrapped( x ); }
                    }
                });
                final XMLStreamReader r = MediaWiki.newXMLStreamReader( in, spool );
                if( msChurnBefore == Long.MAX_VALUE ) msChurnBefore = System.currentTimeMillis();
                  // After having detected the most recent change, so no gap in which
                  // stale files might get cached and retained.
                queryContinuation = null;
                while( r.hasNext() )
                {
                    r.next();
                    if( r.isStartElement() && "rc".equals( r.getLocalName() ))
                    {
                        ensure( r.getAttributeValue(/*ns*/null,"title"), msChurnBefore,
                          /*allowNewFile*/false );
                        if( tsLatestChurned == null ) // then this is the last one (first in list)
                        {
                            tsLatestChurned = r.getAttributeValue( /*ns*/null, "timestamp" );
                        }
                    }
                    else if( !fromScratch && r.isStartElement()
                      && "recentchanges".equals( r.getLocalName() ))
                    {
                        final String rcstart = r.getAttributeValue( /*ns*/null, "rcstart" );
                        if( rcstart != null )
                        {
                            // The null check also serves to guard against clobbering. Up
                            // to two elements are expected with this same name, only one
                            // of which has the sought-for attribute.
                            queryContinuation = "&rcstart=" + rcstart;
                        }
                    }
                    else if( r.isStartElement() && "error".equals( r.getLocalName() ))
                    {
                        throw new MediaWiki.APIError( r );
                    }
                }
                if( queryContinuation == null ) break queryChain;
            }
            catch( XMLStreamException x ) { throw new IOException( x ); }
            finally{ spool.unwind(); }
        }

        if( tsLatestChurned == null ) // then it must be a brand new wiki, with no changes
        {
            final SimpleDateFormat iso8601Formatter =
             // new SimpleDateFormat( SimpleDateFormatX.ISO_8601_PATTERN_C );
             /// but MediaWiki cannot parse 2010-05-02T18:08:01-0400, so use GMT and 'Z' suffix
                new SimpleDateFormat( SimpleDateFormatX.ISO_8601_LOCAL_PATTERN + "'Z'" );
            iso8601Formatter.setTimeZone( TimeZone.getTimeZone( "GMT" ));
            tsLatestChurned = iso8601Formatter.format( new Date( System.currentTimeMillis()
              - 1000/*ms per s*/ * 3600/*s per hour*/ * 24/*hours per day*/ * 7/*days*/ ));
                // back far enough to cover clock mis-sync between hosts
        }
        final Churn churn = new Churn( tsLatestChurned );
        churn.writeObject( WikiCache.this );
        return churn;
    }


    private final File churnSerialFile = new File( WikiCache.this, Churn.SERIAL_FILE_NAME );


    /** @param msChurnBefore the earliest modtime acceptable without churning. If the
     *      cached file is older, it will be churned.
     * @param allowNewFile whether to allow the addition of a new file, or only churning
     *      of the existing one.
     * @return the corresponding file from the cache; or null if allowNewFile is false,
     *      and the file did not already exist in the cache.
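     *      For example, an illustrative sketch of the mapping applied by newCacheFile:
     *      the page "User:Example/Sub" corresponds to the cache file
     *      <pre>
     *          ~/votorola/in/wiki/User/Example/Sub.json
     *      </pre>
     *      while a page of the main namespace, such as "Sandbox", corresponds to
     *      Main/Sandbox.json.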
     */
    private File ensure( final String fullPageName, final long msChurnBefore,
      final boolean allowNewFile ) throws IOException
    {
        // Currently this caches a JSON file of the entire RDF export, which is huge. In
        // future we'll support a soft-scripted transform of the result to a minimal
        // JSON (or perhaps any format) that will then be cached and returned.
        final File cacheFile = newCacheFile( fullPageName );
        if( !cacheFile.exists() )
        {
            if( !allowNewFile ) return null;

            FileX.traverse( cacheFile, // create parent directories and make them writable by all
              new FileFilter() // up
              {
                  public boolean accept( final File f ) { return !f.exists(); }
              },
              new FileFilter() // down
              {
                  public boolean accept( final File f )
                  {
                      if( f != cacheFile )
                      {
                          f.mkdir();
                          f.setWritable( true, /*ownerOnly*/false );
                      }
                      return true;
                  }
              });
        }
        if( !cacheFile.exists() || cacheFile.lastModified() < msChurnBefore )
        {
            final Model data = ModelFactory.createDefaultModel();
            final Spool spool = new Spool1();
            try
            {
                final InputStream in = new BufferedInputStream(
                  newRDFImportStream( fullPageName, spool ));
                spool.add( new Hold()
                {
                    public void release()
                    {
                        try{ in.close(); }
                        catch( Exception x ) { throw VotorolaRuntimeException.castOrWrapped( x ); }
                    }
                });
                data.read( in, /*base, not needed, no relative URLs*/null );
            }
            finally{ spool.unwind(); }

            final File tmpFile = File.createTempFile( cacheFile.getName(), ".json" );
            try
            {
                final Query q = QueryFactory.create( "SELECT * WHERE { ?s ?p ?o }" );
                  // http://tech.groups.yahoo.com/group/jena-dev/message/23035
                final QueryExecution qexec = QueryExecutionFactory.create( q, data );
                final ResultSet rs = qexec.execSelect();
                final BufferedOutputStream out = new BufferedOutputStream(
                  new FileOutputStream( tmpFile ));
                try
                {
                    ResultSetFormatter.outputAsJSON( out, rs ); // appears to output UTF-8
                }
                finally{ out.close(); }

                tmpFile.setWritable( true, /*ownerOnly*/false );
             // cacheFile.delete();
             /// non-atomic with rename, but rename alone should work:
                FileX.renameFromDefaultToMv( tmpFile, cacheFile );
            }
            finally{ if( tmpFile.isFile() ) tmpFile.delete(); } // clean up from exception
        }
        return cacheFile;
    }


    private File newCacheFile( final String fullPageName )
    {
        final MatchResult m = MediaWiki.parsePageName( fullPageName );
        if( m == null ) throw new VotorolaRuntimeException( "malformed page name: " + fullPageName );

        String namespace = m.group( 1 );
        if( namespace == null ) namespace = "Main";

        final String pageName = m.group( 2 );
        return new File( WikiCache.this, namespace + File.separator
          + (File.separatorChar == '/'? pageName: pageName.replace('/',File.separatorChar))
          + ".json" );
    }


    /** @param spool the spool for closing resources.
     * @return the stream from which to read the RDF. Close it when you are finished
     *      with it.
     */
    private InputStream newRDFImportStream( final String fullPageName, final Spool spool )
      throws IOException
    {
        final UriBuilder ub = UriBuilder.fromUri( voteServer.pollwiki().scriptURI() );
        ub.path( "index.php" );
     // ub.queryParam( "page", fullPageName );
     /// ignored by Semantic MediaWiki 1.7.1, it instead serves the query form,
     /// so append as a subpage instead:
        ub.queryParam( "title", "Special:ExportRDF/" + fullPageName );
        ub.queryParam( "backlinks", "0" );
        ub.queryParam( "recursive", "0" );
        final URI uri = ub.build();
        LoggerX.i(getClass()).fine( "querying wiki " + uri );
        final HttpURLConnection http = (HttpURLConnection)( uri.toURL().openConnection() );
        URLConnectionX.connect( http );
        spool.add( new Hold()
        {
            public void release() { http.disconnect(); }
        });
        return http.getInputStream();
    }


    private final VoteServer voteServer;


   // ====================================================================================


    private static final class Churn implements Serializable
    {

        private static final long serialVersionUID = 0L;


        private Churn( String _tsLatestChurned )
        {
            if( _tsLatestChurned == null ) throw new NullPointerException(); // fail fast

            tsLatestChurned = _tsLatestChurned;
        }


       // ````````````````````````````````````````````````````````````````````````````````


        static Churn readObject( final WikiCache wikiCache ) throws IOException
        {
            final File serialFile = wikiCache.churnSerialFile;
            if( !serialFile.isFile() ) return null;

            try
            {
                return (Churn)FileX.readObject( serialFile );
            }
            catch( ClassNotFoundException x ) { throw new RuntimeException( x ); }
        }


        final void writeObject( final WikiCache wikiCache ) throws IOException
        {
            final File serialFile = wikiCache.churnSerialFile;
            FileX.writeObject( Churn.this, serialFile );
            serialFile.setWritable( true, /*ownerOnly*/false );
        }


       // --------------------------------------------------------------------------------


        private static final String SERIAL_FILE_NAME = "lastChurn.serial";


        /** The timestamp of the latest page revision in this churn.
         */
        private final String tsLatestChurned;

    }

}