package votorola.a.diff.harvest; // Copyright 2012, Christian Weilbach. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Votorola Software"), to deal in the Votorola Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicence, and/or sell copies of the Votorola Software, and to permit persons to whom the Votorola Software is furnished to do so, subject to the following conditions: The preceding copyright notice and this permission notice shall be included in all copies or substantial portions of the Votorola Software. THE VOTOROLA SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE VOTOROLA SOFTWARE OR THE USE OR OTHER DEALINGS IN THE VOTOROLA SOFTWARE. 
import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PrintStream; import java.io.UnsupportedEncodingException; import java.sql.SQLException; import java.text.DateFormatSymbols; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Arrays; import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.TimeZone; import java.util.concurrent.atomic.AtomicBoolean; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang3.StringEscapeUtils; import com.ibm.icu.util.Calendar; import com.ibm.icu.util.GregorianCalendar; import votorola.a.diff.harvest.auth.BasicAuthenticator; import votorola.a.diff.harvest.cache.HarvestCache; import votorola.a.diff.harvest.kick.Kick; import votorola.a.diff.harvest.kick.KickReceiver; import votorola.a.diff.harvest.kick.Kicker; import votorola.a.diff.harvest.kick.UpdateKick; import votorola.a.diff.harvest.run.AbstractFetcher; import votorola.a.diff.harvest.run.Fetcher; import votorola.a.diff.harvest.run.HarvestRunner; import votorola.g.lang.ThreadSafe; import votorola.g.lang.Warning; import votorola.g.logging.LoggerX; /** *

* A harvester implementation for pipermail archives. Pipermail archives are * generated by mailman. * Harvested messages are stored in {@linkplain HarvestCache} and fetchers are * scheduled with {@linkplain HarvestRunner}. *

* *

* Basically we do a depth first search * sorted by date. Web-view on the remote archive for this harvester: *

*
* *
 * Tree of linked pages.     Level      Example remote archive URLs
 *             r1              {@linkplain RootFetcher root}       http://mail.zelea.com/list/votorola/    
 *            / \
 *           /   \
 *          /     \
 *         /       \
 *       i2        i3          {@linkplain PeriodFetcher index}      2010-Jan/date.html                      
 *       / \       /|\
 *      /   \     / | \
 *     m4    m5 m6 m7 m8       {@linkplain MessageFetcher message}    2010-Jan/003321.html
 * 
* *

* All fetchers depend on one HTTP request. Runtime steps for these
* {@linkplain AbstractFetcher fetchers}:
*

 * <ol>
 * <li>Create the fetcher and {@linkplain HarvestRunner#scheduleLast(Fetcher)
 * schedule} it.</li>
 * <li>Once {@linkplain HarvestRunner} handles the fetcher, it asynchronously
 * fetches the remote HTTP page and runs the fetcher in its thread pool.</li>
 * <li>Having access to {@linkplain AbstractFetcher#getInputStream()} now, the
 * fetcher parses the page.</li>
 * <li>r1, i2, i3 start fetchers for the index of URLs parsed out of the page.</li>
 * </ol>

*

* Result after r1: i2-i3, after i2: m4-m5-i3, after i3:
* m6-m7-m8.
* The state is saved by anonymously extending the last MessageFetcher, like m5 or
* m8, appropriately.
*

*

* Note: This is only an example. You can submit your * {@linkplain votorola.a.diff.harvest.run.Fetcher} differently and you can also * save state differently. Usage of the {@linkplain HarvestRunner} is * recommended for graceful I/O handling though. *

*
* @see votorola.a.diff.harvest The communication diagram for details of the
*      overall harvesting concept.
*/
public class PipermailHarvester {

    /** Logger shared by the harvester and all of its nested fetchers. */
    private final static Logger LOGGER = LoggerX.i(PipermailHarvester.class);

    // / / / Harvesting services

    /**
     * Harvest cache. Assigned in the constructor; message fetchers use it to
     * look up difference URLs and to store parsed messages.
     */
    private final HarvestCache hCache;

    /**
     * Since PipermailHarvester.PAT_AUTHOR allows us to parse the E-Mail out of
     * the page, we can generate the MailishUsername directly and therefore use
     * the verifier which just compares mailish usernames of the authors of the
     * difference with the author of the message.
     */
    private final BasicAuthenticator aVerifier;

    /**
     * Global singleton scheduler.
     */
    private final HarvestRunner runner = HarvestRunner.i();

    // / / / Forum specific settings

    // list-info page

    /**
     * Pattern for list-info page. (for setup)
     *
     * NOTE(review): the expression is empty as it stands, yet the setup
     * fetcher reads group(1) from every match of this pattern. The original
     * regex text appears to have been lost (presumably an HTML-like literal
     * that was stripped) -- restore it from the upstream source.
     */
    public final static Pattern PAT_LISTINFO = Pattern
            .compile("");

    // root page

    /**
     * Find input encoding. Group 1 is the charset name announced in the
     * page's content type.
     */
    public final static Pattern PAT_INPUTENC = Pattern
            .compile("text/html; charset=([a-z0-9\\-]+)");

    /**
     * Parse language.
     *
     * NOTE(review): this expression is empty as well, while its group(1) is
     * used as the language code of the archive locale. Presumably lost the
     * same way as PAT_LISTINFO -- restore it from the upstream source.
     */
    public final static Pattern PAT_LANG = Pattern
            .compile("");

    /**
     * Pattern to parse sub-list of posts for each month in
     * {@linkplain RootFetcher}. If period is in years: 2012/date.html ... in
     * months: 2012-October/date.html TODO add/fix weekly pattern if used
     * somewhere
     *
     * Group 1 is the whole period name, group 2 its leading digits and
     * group 3 the optional word part that follows.
     */
    public final static Pattern PAT_PERIOD = Pattern
            .compile("=\"((\\d+)-?(\\w+)?)/date.html\"\\>\\[ \\S+ \\]");

    /**
     * Pattern to parse list of posts in this month list in
     * {@linkplain PeriodFetcher}. Group 1 is the relative name of the
     * message page.
     */
    public final static Pattern PAT_POST = Pattern
            .compile("=\"(\\S+\\.html)\"\\>");

    /**
     * Pattern to find author email in {@linkplain MessageFetcher}. It only
     * locates the line; the address itself is taken with PAT_AUTHOR2.
     */
    public final static Pattern PAT_AUTHOR = Pattern.compile("HREF=\"mailto");

    /**
     * Pattern to parse author email in {@linkplain MessageFetcher}. Group 1
     * is the obfuscated address text.
     */
    public final static Pattern PAT_AUTHOR2 = Pattern.compile("TITLE.+\\>(.+)");

    /**
     * Pattern to parse sent date in {@linkplain MessageFetcher}.
*/ public final static Pattern PAT_SENTDATE = Pattern .compile("(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)"); // example with grouping: (Di )(Okt )(30 )(23:48:50 )(CET )(2012) // or: (Jeu )(1 )(Nov )(12:00:20 )(UTC )(2012) // needed to always parse English timezone in Messagefetcher /** * Pattern to find the start of the body of the message in * {@linkplain MessageFetcher}. */ public final static Pattern PAT_CONTENT_START = Pattern .compile("\\"); /** * Pattern to find the end of the body of the message in * {@linkplain MessageFetcher}. */ public final static Pattern PAT_CONTENT_END = Pattern.compile("\\"); private final StateTable stateTable; /** * Structure to save state of a single archive. declared package private * for example documentation */ @Warning("non-API") @ThreadSafe class Archive { class SetupFetcher extends AbstractFetcher { final Archive archive; public SetupFetcher(final Archive archive) { super(archive.archiveUrl(), ""); this.archive = archive; } @Override public void run() { BufferedReader in; String tmpListInfo = ""; String enc = ""; try { // expect the first fetch to work for html parsing only with // iso-8859-1 encoding (?) in = new BufferedReader(new InputStreamReader( getInputStream(), "iso-8859-1")); for (String line = in.readLine(); line != null; line = in .readLine()) { Matcher m; m = PAT_INPUTENC.matcher(line); if (m.find()) { enc = m.group(1); archive.setInputEncoding(enc); } m = PAT_LISTINFO.matcher(line); if (m.find()) { tmpListInfo = m.group(1); } } } catch (UnsupportedEncodingException e) { final String msg = "Bug: Could not parse list-info HTML with iso-8859-1."; LOGGER.severe(msg); archive.setFaulty(msg); } catch (IOException e) { final String msg = "Bug: Could not get list-info HTML."; LOGGER.severe(msg); archive.setFaulty(msg); } if (enc.isEmpty() || tmpListInfo.isEmpty()) { final String msg = "Could parse setup for archive:" + (enc.isEmpty() ? " encoding not found" : "") + (tmpListInfo.isEmpty() ? 
", list-info page not found" : ""); LOGGER.severe(msg); archive.setFaulty(msg); return; } archive.setInputEncoding(enc); final String listInfo = tmpListInfo; runner.scheduleFirst(new Fetcher() { @Override public void run() { final String enc = archive.inputEncoding(); final String langCode = indexPage(inputStream, PAT_LANG, enc).get(0); final Locale locale = new Locale(langCode); archive.setLocale(locale); } @Override public void fault(final String msg) { archive.setFaulty(msg); } // / / Fetcher @Override public String archiveUrl() { return archiveUrl; } @Override public String url() { return listInfo; } private InputStream inputStream; @Override public void setInputStream(InputStream in) { this.inputStream = in; } @Override public void setStatusCode(int code) { if (code >= 400) { final String error = "Failed to fetch list-info page: " + listInfo; LOGGER.warning(error); archive.setFaulty(error); } } }); } @Override public void fault(final String msg) { archive.setFaulty(msg); } } private final String archiveUrl; public Archive(final String archiveUrl) { this.archiveUrl = archiveUrl; runner.scheduleFirst(new SetupFetcher(this)); } public synchronized String archiveUrl() { return archiveUrl; } private volatile boolean isFaulty = false; public synchronized boolean isFaulty() { return isFaulty; } private volatile String faultMsg = ""; public synchronized String fault() { return faultMsg; } public synchronized void setFaulty(final String msg) { this.faultMsg = msg; isFaulty = true; } private volatile Locale locale = null; public synchronized void setLocale(final Locale locale) { if (locale == null) { final String msg = "Could not parse locale, is null."; LOGGER.severe(msg); setFaulty(msg); return; } this.locale = locale; } private volatile String inputEncoding = ""; public synchronized void setInputEncoding(final String encoding) { if (encoding.isEmpty()) { final String msg = "Parsed encoding is empty."; LOGGER.severe(msg); setFaulty(msg); return; } inputEncoding = 
encoding; } public synchronized String inputEncoding() { return inputEncoding; } /** * Whether the archive has been updated in this instance yet. */ public final AtomicBoolean firstRun = new AtomicBoolean(true); private volatile Date nextPeriodStart = new Date(0); private volatile String currentPeriod = ""; /** * This determines whether UpdateContext will lookup the current month * directly or do an index update if the month has changed. * * @param current */ public synchronized void setCurrentPeriod(String current) { this.currentPeriod = current; String format = "yyyy"; final Matcher m = PAT_PERIOD.matcher(current); boolean monthly = false; if (m.find()) { final String month = m.group(3); if (!month.isEmpty()) { monthly = true; format += "-MMMM"; } } SimpleDateFormat sdf = new SimpleDateFormat(format, Locale.ENGLISH); GregorianCalendar cal = new GregorianCalendar(); try { cal.setTime(sdf.parse(current)); } catch (ParseException e) { LOGGER.log(Level.WARNING, "Cannot parse date from: " + current, e); } cal.roll(monthly ? Calendar.MONTH : Calendar.YEAR, true); this.nextPeriodStart = cal.getTime(); } public synchronized Date nextPeriodStart() { return nextPeriodStart; } public synchronized String currentPeriod() { return currentPeriod; } public synchronized Locale locale() { return locale; } } /** * Tracks all configured archives. Key is the base URL of the archive. * declared package private for example documentation */ final Map archives = Collections .synchronizedMap(new HashMap()); /** * Receives {@linkplain Kick kicks} from {@linkplain Kicker} for this * Harvester. 
declared package private for example documentation */ @Warning("non-API") class MyKickReceiver implements KickReceiver { /** * Handles kick events from {@linkplain Kicker} * * @param kick * some {@linkplain UpdateKick} */ public void handle(Kick kick) { if (kick instanceof UpdateKick) { final UpdateKick updateKick = (UpdateKick) kick; if (!updateKick.archiveDesign().contains("Pipermail")) { return; } final String archiveUrl = updateKick.archiveUrl(); if (!archives.containsKey(archiveUrl)) { archives.put(archiveUrl, new Archive(archiveUrl)); } UpdateContext context = new UpdateContext(updateKick); context.run(); } } } /** * Construct a new harvester for pipermail. */ public PipermailHarvester() { hCache = HarvestCache.i(); Kicker.i().register(new MyKickReceiver()); aVerifier = new BasicAuthenticator(); stateTable = new StateTable(hCache.getDatabase()); try { if (!stateTable.exists()) { stateTable.create(); } } catch (SQLException e) { LOGGER.log(Level.SEVERE, "State table initialization error.", e); System.exit(1); } } /** * Provides update context. */ @Warning("non-API") abstract class ContextFetcher extends AbstractFetcher { protected final UpdateContext context; public ContextFetcher(final UpdateContext context, final String path) { super(context.archiveUrl(), path); this.context = context; } @Override public void fault(final String msg) { context.archive.setFaulty(msg); context.next(); } } /** * Context to run fetchers and synchronize state during an update. 
* */ @Warning("non-API") @ThreadSafe class UpdateContext implements Runnable { private final AtomicBoolean isStarted = new AtomicBoolean(false); private final List queue = Collections .synchronizedList(new LinkedList()); public final Marker endMarker; private volatile Marker startMarker; private final Archive archive; // kick objects private final String archiveUrl; private final PrintStream reportStream; private final HarvestReporter reporter; public UpdateContext(final UpdateKick kick) { this.archiveUrl = kick.archiveUrl(); this.archive = archives.get(archiveUrl); this.reporter = kick.reporter(); this.reportStream = kick.reporter().printStream(); Marker tempEndMarker = null; try { // ignore possibly stale temporary entries if (archive.firstRun.get()) { tempEndMarker = stateTable.getNewest(archiveUrl, StateTable.Type.PERM); } else { tempEndMarker = stateTable.getNewest(archiveUrl); } } catch (SQLException e) { LOGGER.log( Level.SEVERE, "Cannot access state storage, further processing is useless.", e); System.exit(1); } finally { endMarker = tempEndMarker; } } public synchronized String archiveUrl() { return archiveUrl; } public synchronized void startUpdate(final Marker m) throws SQLException { if (isStarted.get()) { // already running. return; } final String message = "Starting update to " + (endMarker.path().isEmpty() ? 
"beginning of archive " : endMarker.path()); LOGGER.fine(message); reportStream.println(message); // other fetcher has grabbed start url already if (!stateTable.put(archiveUrl, m) && !archive.firstRun.get()) { finished(); } startMarker = m; isStarted.set(true); archive.firstRun.set(false); } public synchronized void run() { AbstractFetcher fetcher; if (new Date().after(archive.nextPeriodStart())) { fetcher = new RootFetcher(this); } else { fetcher = new PeriodFetcher(archive.currentPeriod(), this); } runner.scheduleLast(fetcher); } public synchronized void failure(final String message) { reportStream.println("Archive: " + archiveUrl + " failed with: " + message); queue.clear(); reporter.proccessFinished(); } public synchronized void next() { if (archive.isFaulty()) { failure(archive.fault()); return; } if (!queue.isEmpty()) { Fetcher fetcher = queue.remove(0); if (!isStarted.get()) { reportStream.println("entering archive: " + fetcher.url()); } else { reportStream.println("harvesting: " + fetcher.url()); } runner.scheduleLast(fetcher); } else { finished(); } } public synchronized void finished() { if (isStarted.get()) { LOGGER.fine("Finishing update: " + endMarker.path()); try { stateTable.finish(archiveUrl, startMarker, endMarker); stateTable.update(archiveUrl); } catch (SQLException e) { LOGGER.log( Level.SEVERE, "Cannot save state, further processing will be useless.", e); System.exit(1); } } queue.clear(); // allow garbage collection reporter.proccessFinished(); } public synchronized Archive archive() { return archive; } } /** * An index fetcher for the whole pipermail archive, like * "http://mail.zelea.com/list/votorola/". Spawns new * {@linkplain PeriodFetcher month fetchers}. declared package private * for example documentation * * @see PeriodFetcher */ @Warning("non-API") class RootFetcher extends ContextFetcher { /** * Carries the context along. 
* * @param context */ public RootFetcher(final UpdateContext context) { super(context, ""); } public void run() { // wait for encoding and locale to be parsed // this only happens on the first run if(context.archive.locale()==null || context.archive.inputEncoding().isEmpty()) { runner.scheduleLast(this); return; } try { // newest first List periods = indexPage(getInputStream(), PAT_PERIOD, context.archive().inputEncoding()); final String thisMonth = periods.get(0); archives.get(archiveUrl()).setCurrentPeriod(thisMonth); // newest last Collections.reverse(periods); for (final String period : periods) { context.queue.add(0, new PeriodFetcher(period, context)); } } catch (Exception e) { final String msg = "Could not fetch root element: " + e.getMessage(); LOGGER.log(Level.WARNING, msg, e); context.archive.setFaulty(msg); } context.next(); } } /** * An index fetcher for one period like "2010-January/date.html" or * "2012/date.html". Spawns new {@linkplain MessageFetcher fetchers} * Fetching the "date.html" index allows us to schedule page fetches by * PipermailHarvester.PAT_SENTDATE. 
* declared package private for * example documentation * * @see RootFetcher * @see MessageFetcher */ @Warning("non-API") class PeriodFetcher extends ContextFetcher { public PeriodFetcher(final String period, final UpdateContext context) { super(context, period + "/date.html"); this.period = period; } private final String period; public void run() { try { // newest last List posts = indexPage(getInputStream(), PAT_POST, context.archive().inputEncoding()); // newest first Collections.reverse(posts); LinkedList cleanList = new LinkedList(); synchronized (context) { for (String post : posts) { if (context.endMarker.path() .equals(period + "/" + post)) { // finished when we hit the marker context.queue.clear(); // remove earlier months break; } cleanList.add(post); } // newest last Collections.reverse(cleanList); for (String post : cleanList) { context.queue.add(0, new MessageFetcher(context, period + "/" + post)); } } context.next(); } catch (Exception e) { final String msg = "Could not fetch period " + period + ": " + e.getMessage(); LOGGER.log(Level.WARNING, msg, e); context.archive().setFaulty(msg); } } } /** * A fetcher for a single page, like "2010-Jan/003882.html". Contains only * one message in pipermail. 
declared package private for example * documentation * * @see PeriodFetcher */ @Warning("non-API") class MessageFetcher extends ContextFetcher { /** * Timezone is parsed in English separately, * * @see run() */ private final SimpleDateFormat dateFormat; public MessageFetcher(final UpdateContext context, final String path) { super(context, path); final Locale locale = context.archive.locale(); if (locale == null) { context.failure("Locale is null"); dateFormat = null; return; } dateFormat = mailmanDateFormat(locale); if (dateFormat == null) { context.failure("Locale not supported: " + locale.getCountry()); } } /** * DateFormat adjustments taken from mailman 2.1.15 LC_MESSAGES * * @param locale * @return dateformat or null if locale is not supported */ private SimpleDateFormat mailmanDateFormat(final Locale locale) { final String format = "EEE MMM d HH:mm:ss yyyy"; DateFormatSymbols dfsFr = new DateFormatSymbols(locale); String[] months = null; String[] weekDays = null; final String code = locale.getLanguage(); if (code.equals(new Locale("fr").getLanguage())) { months = new String[] { "Jan", "Fév", "Mar", "Apr", "May", "Juin", "Juil", "Aou", "Sep", "Oct", "Nov", "Déc" }; // first needs to be empty for DateFormat, starting with sunday weekDays = new String[] { "", "Dim", "Lun", "Mar", "Mer", "Jeu", "Ven", "Sam" }; } if (code.equals(new Locale("es").getLanguage())) { months = new String[] { "Ene", "Feb", "Mar", "Abr", "May", "Jun", "Jul", "Ago", "Sep", "Oct", "Nov", "Dic" }; weekDays = new String[] { "", "Dom", "Lun", "Mar", "Mie", "Jue", "Vie", "Sab" }; } if (code.equals(new Locale("de").getLanguage())) { months = new String[] { "Jan", "Feb", "Mär", "Apr", "Mai", "Jun", "Jul", "Aug", "Sep", "Okt", "Nov", "Dez" }; weekDays = new String[] { "", "So", "Mo", "Di", "Mi", "Do", "Fr", "Sa" }; } // default? 
if (code.equals(new Locale("en").getLanguage())) { return new SimpleDateFormat(format, locale); } if (months == null || weekDays == null) { return null; } dfsFr.setShortWeekdays(weekDays); dfsFr.setShortMonths(months); return new SimpleDateFormat(format, dfsFr); } public void run() { final Archive archive = context.archive(); try { final BufferedReader in = new BufferedReader( new InputStreamReader(getInputStream(), archive.inputEncoding())); String author = ""; Date date = null; StringBuilder bodyB = new StringBuilder(); boolean inContent = false; for (String line = in.readLine(); line != null; line = in .readLine()) { Matcher m = null; if (inContent) { m = PAT_CONTENT_END.matcher(line); if (m.find()) { inContent = false; continue; } String clean = line.replaceAll("\\<.*?>", ""); // remove escaped tags clean = StringEscapeUtils.unescapeHtml4(clean); clean = clean.replaceAll("\\<.*?>", ""); if (clean.startsWith("___________________________")) { inContent = false; continue; } if (!clean.startsWith(">") && !clean.startsWith(""") && !clean.endsWith("wrote:") && !clean.startsWith("-------")) { bodyB.append(clean).append(" "); } continue; } // effectively ignores first line of content for detection // this is intended. m = PAT_CONTENT_START.matcher(line); if (m.find()) { inContent = true; continue; } m = PAT_AUTHOR.matcher(line); if (author.isEmpty() && m.find()) { Matcher m2 = PAT_AUTHOR2.matcher(line); if (!m2.find()) { // author on next line line = in.readLine(); } Matcher m3 = PAT_AUTHOR2.matcher(line); if (m3.find()) { author = m3.group(1); final String[] ATs = { "at", "en" /* spanish */}; for (final String AT : ATs) { author = author.replace(" " + AT + " ", "@"); } } continue; } m = PAT_SENTDATE.matcher(line); if (date == null && m.find()) { // Timezone is always localized in English by Mailman. 
// therefore we have to parse separately final String tz = m.group(5); // build date format in correct order final StringBuilder dateStringB = new StringBuilder(); final String weekDay = m.group(1); dateStringB.append(weekDay); // Find out where the day number is, second or third // field... Pattern NUM = Pattern.compile("\\d+"); String month; String dayOfMonth; if (NUM.matcher(m.group(2)).find()) { dayOfMonth = m.group(2); month = m.group(3); } else { dayOfMonth = m.group(3); month = m.group(2); } // reassemble dateStringB.append(" ").append(month); dateStringB.append(" ").append(dayOfMonth); dateStringB.append(" ").append(m.group(4)); // time dateStringB.append(" ").append(m.group(6)); // year final String dateString = dateStringB.toString(); dateFormat.setTimeZone(TimeZone.getTimeZone(tz)); date = dateFormat.parse(dateString); continue; } } final String summary = HarvestUtil.summarize(bodyB.toString()); final List diffUrls = hCache.findDiffUrls(bodyB .toString()); LOGGER.finest("new message parsed: " + url()); if (!diffUrls.isEmpty()) { final Message msg = Message.create(summary, author, archiveUrl(), path(), diffUrls, date); LOGGER.fine("new diff message parsed: " + msg.toString()); hCache.store(msg, aVerifier); } context.startUpdate(Marker.create(path(), date)); } catch (ParseException e) { final StringBuilder msgB = new StringBuilder() .append("Could not parse date: "); msgB.append(e.getMessage()); DateFormatSymbols dfs = dateFormat.getDateFormatSymbols(); msgB.append("\nLocalized week day strings: ").append( Arrays.toString(dfs.getShortWeekdays())); msgB.append("\nLocalized month strings: ").append( Arrays.toString(dfs.getShortMonths())); LOGGER.log(Level.WARNING, msgB.toString(), e); archive.setFaulty(msgB.toString()); } catch (SQLException e) { final String msg = "Database problem: " + e.getMessage(); LOGGER.log(Level.SEVERE, msg, e); archive.setFaulty(msg); } catch (IOException e) { // should never happen final String msg = "Bug: IO-Problem while reading 
page buffer: " + e.getMessage(); LOGGER.log(Level.SEVERE, msg, e); archive.setFaulty(msg); } catch (Exception e) { final String msg = "Bug: Something weird happened: " + e.getMessage(); LOGGER.log(Level.SEVERE, msg, e); archive.setFaulty(msg); } context.next(); } } /** * Any index type fetcher that can be defined by a {@linkplain Pattern * pattern} can use this routine. declared package private for example * documentation Order is by match on stream. */ private List indexPage(final InputStream is, final Pattern pat, final String encoding) { if (is == null) { LOGGER.log(Level.WARNING, "Tried to index a stream which is NULL. This is a bug!"); return new LinkedList(); } BufferedReader tempIn = null; try { tempIn = new BufferedReader(new InputStreamReader(is, encoding)); } catch (UnsupportedEncodingException e) { LOGGER.log(Level.WARNING, "Encoding '" + encoding + "' is not supported. This is a faulty Java setup!", e); return new LinkedList(); } final BufferedReader in = tempIn; LinkedList hits = new LinkedList(); try { for (String s = in.readLine(); s != null; s = in.readLine()) { Matcher m = pat.matcher(s); if (m.find()) { hits.add(m.group(1)); continue; } } } catch (IOException e) { LOGGER.log(Level.WARNING, "Cannot read the HTML input stream. This is a bug.", e); } finally { try { in.close(); } catch (IOException e) { LOGGER.log(Level.WARNING, "Cannot class the HTML input stream. This is a bug.", e); } } return hits; } }