#!/bin/bash
#
# wiki-copy/wget - Make a static copy of the pollwiki-cum-streetwiki with GNU Wget
#
#   * Wget manual - https://www.gnu.org/software/wget/manual/wget.html
#   * Wget configuration - /etc/wgetrc, ~/.wgetrc - no settings are wanted or expected
#   * Other helpful advice - http://camwebb.info/blog/2012-12-20/
#
# Copyright 2018, Michael Allan. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Votorola Software"), to deal in the Votorola Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicence, and/or sell copies of the Votorola Software, and to permit persons to whom the Votorola Software is furnished to do so, subject to the following conditions: The preceding copyright notice and this permission notice shall be included in all copies or substantial portions of the Votorola Software. THE VOTOROLA SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE VOTOROLA SOFTWARE OR THE USE OR OTHER DEALINGS IN THE VOTOROLA SOFTWARE.


log='/tmp/wiki-copy-wget.log'
rm --force -- "$log" # clearing the log file, to start with a fresh one
  # Quoted and guarded with '--' so that the path is always taken as a single file operand.



# =============================
# Define the main parts to copy
# =============================
# The variable *r* accumulates a single Perl-compatible regular expression, which the main *wget*
# call is given as its accept pattern.  Each group opener '(?' and its type character (':' for a
# plain group, '!' for a negative lookahead) are appended in separate steps; the indentation of
# the appends below mirrors the nesting of the groups.

# r='^http://obsidian/'
#   # This name might be more robust than reluk.ca/ in case the gateway goes down during the process.
#   ##### No it would not; the wiki content itself loads explicitly from reluk.ca/, regardless.
r='^http://reluk\.ca/'
r+='(?'

    # Components of pages
    # -------------------
  r+=':mediawiki/(?' # $wgScriptPath [WG]
    r+=':extensions/'
    r+='|images/'
    r+='|load\.php'
    r+='|skins/'
  r+=')'

    # Pages whole
    # -----------
  r+='|(?'

      # principal pages
      # - - - - - - - -
    r+=':w' # $wgArticlePath

      # secondary pages
      # - - - - - - - -
    r+='|mediawiki/index\.php(?!.*[?&](?' # $wgScriptPath
        ### omitting these ones ###
      r+=':action=(?!edit|history|info|raw)' # no actions but these
        # raw : In order to serve required resources, such as:
        #   /mediawiki/index.php?title=MediaWiki:Gadget-ReferenceTooltips.js&action=raw&ctype=text/javascript&5395
      r+='|feed=' # no news feeds
      r+='|limit=(?!50\b)' # no length variants for the list pages; only the default (50),
        # which is specified explicitly in back-and-forth paging requests
      r+='|oldid=' # no earlier revisions of pages
      r+='|printable=' # no print-friendly pages
      r+='|redlink=' # no placeholder pages for the targets of broken links
    r+='))'
  r+=')'
  r+='(?'

      # generally omitting special pages
      # - - - - - - - - - - - - - - - - -
    r+='!.+Special(?:%3A|:)' # [3A]
    r+='(?'
        ### keeping just these ones ###
      r+='!AllPages'
      r+='|Browse(?!.*[?&]dir=)' # omitting filtered variants
      r+='|Categories'
    # r+='|Contributions' # costly in copy time
      r+='|ListFiles'
      r+='|ListUsers'
      r+='|PrefixIndex'
      r+='|Properties'
      r+='|SearchByProperty(?!/Modification-20date)' # omitting the special
        # *Modification date* property, which has very many pages here
      r+='|SemanticStatistics'
      r+='|SpecialPages'
      r+='|Statistics'
      r+='|Types'
      r+='|UnusedProperties'
      r+='|Version'
      r+='|WantedProperties'
      r+='|WhatLinksHere(?!.*[?&]hide\w+=)' # omitting filtered variants
    r+=')'
  r+=')'
r+=')'



# ===================
# Copy the main parts
# ===================

opts='' # general options for the *wget* calls, below
  # Kept as one space-separated string; the *wget* calls expand it unquoted so that it splits
  # back into separate options.
opts+=' --execute robots=off'
  # Avoid gaps in the copy due to robot exclusions by the server.
  # The server is local and none of its exclusions is relevant in this case.
opts+=' --no-host-directories'
opts+=' --no-verbose'
# opts+=' --timestamping'
#   # In case any page is served with a timestamp, and the data turns out to be useful someday.
#   # But then it issues warnings for all pages not so served, cluttering the output.
opts+=' --wait=0.2'

# The general options in $opts are deliberately left unquoted, so that the string splits into
# separate options.  The regular expression and the log path are quoted, because the expression
# is full of glob characters ('?', '*', '[...]') that must reach Wget untouched.
# shellcheck disable=SC2086
nice wget $opts \
  --accept-regex="$r" \
  --adjust-extension \
  --convert-links \
  --level=inf \
  --output-file="$log" \
  --recursive \
  --regex-type=pcre \
  http://reluk.ca/w/Wiki:Main_page
  #
  # By recursively traversing the links of the main page, this call does most of the copying work.
  #
  # --adjust-extension
  #
  #     Else it omits directory index pages.  https://superuser.com/q/790039
  #     Else it omits some directories entire, such as http://reluk.ca/w/Tor/assorted_division,
  #     complaining "w/Tor/assorted_division: Not a directory".
  #
  # --level
  #
  #     Use '0' or 'inf' for an unlimited depth.  This option is not well documented.

opts+=" --append-output=$log"
  # The calls that follow append to the log of the call above, where that call wrote it afresh.



# ====================
# Add Special:FilePath
# ====================
# A wiki script uses it.  http://reluk.ca/sys/host/obsidian/home/v/votorola/web/wiki/pollwiki.js
# Add only the children - FilePath/* -.  FilePath itself would be useless as a directory index.
# Rather let the web server generate a default index.

# Each name listed here is fetched from under the common Special:FilePath prefix.  The order of
# the list is the order in which the URLs are handed to Wget.
file_path_base='http://reluk.ca/w/Special:FilePath/'
file_names=(
  Assorted-map-small.png
  BCDiv-small.png
  BCDiv.png
  BCLogo.png
  CanDiv-small.png
  CanDiv.png
  CanLogo.png
  City-wide-map-small.png
  City-wide-map.png
  Coloradio.png
  Communikative-delegation-englisch.png
  Conch_drawing.jpg
  DDLogo.png
  DeLogo.png
  Delegierung-nicht-kommunikativ-innerhalb.png
  Dieter_althaus.png
  Differencefeed_votorola_a_diff_harvest_classes.png
  Followdiscussions.png
  GDiv-small.png
  GDiv.png
  GLogo.png
  Götz_werner.png
  IPLogo.png
  Kommunikative-delegation-wiki.png
  Kommunikative-delegation.png
  LiqdLogo.png
  MADiv-small.png
  MADiv.png
  NYCDiv-small.png
  NYCDiv.png
  NYCLogo.png
  NYC_GA.png
  NYC_GA_call-to-action-small.jpg
  NYC_GA_call-to-action.jpg
  NYC_GA_demands-small.jpg
  NYC_GA_demands.jpg
  NYC_GA_ovn-small.jpg
  NYC_GA_ovn.jpg
  NYC_assorted_City-wide-small.png
  NYC_assorted_City-wide.png
  Non-communicative-delegation-englisch.png
  OVN_authority.png
  P2PLogo.png
  PPDeLogo.png
  Pinkwart.png
  PollPageNav-poll-disabled.png
  PollPageNav-poll-link.png
  PollPageNav-poll.png
  PollPageNav-position-disabled.png
  PollPageNav-position-link.png
  PollPageNav-position.png
  PollPageNav-voting-disabled.png
  PollPageNav-voting-link.png
  PollPageNav-voting.png
  Pollwiki-pollserver-voteprocessing.png
  Riding-map-small.png
  Riding-map.png
  School-ward-14-map-small.png
  School-ward-map-small.png
  School-ward-map.png
  SiluetteMan.png
  SiluetteWoman.png
  SysDefaultDiv-small.png
  SysLogo.png
  TorLogo.png
  Trinity-spadina-map-small.png
  Trinity-spadina-map.png
  USDiv-small.png
  USDiv.png
  USLogo.png
  Unbounded-map-small.png
  Unbounded-map.png
  Vote_mirroring-Collab-pull.png
  Vote_mirroring-Collab-push.png
  Votorola-logo.png
  Votorola_nMy_MOP.png
  WPELogo.png
  Ward-19-map-small.png
  Ward-19-map.png
  Ward-20-map-small.png
  Ward-20-map.png
  Ward-map-small.png
  Ward-map.png
)

# Form the full URL of each file by prefixing its name.
file_urls=()
for file_name in "${file_names[@]}"; do
  file_urls+=("${file_path_base}${file_name}")
done

# A single call fetches them all.  The general options in $opts are left unquoted, so that the
# string splits into separate options.
# shellcheck disable=SC2086
nice wget $opts "${file_urls[@]}"



# =======================
# Add more resource loads
# =======================
# Wiki scripts will request these, though they are not entailed in the main part of the copy.
# Seeing that the main part often includes German versions (lang=de), I also include them here.

# Each URL is assembled as  $q <language> $r <module list and trailing parameters>.
# Note that *r* is reassigned here, replacing the regular expression it held earlier.
q='http://reluk.ca/mediawiki/load.php?debug=false&lang='
r='&modules='
s1='jquery%2Cmediawiki&only=scripts&skin=vector&version=20130626T163729Z'
s2='jquery.client%2Ccookie%2CmwExtension%7Cmediawiki.legacy.ajax%2Cwikibits%7Cmediawiki.notify%2Cutil%7Cmediawiki.page.startup&skin=vector&version=20160525T224015Z&*'
s3='ext.gadget.ReferenceTooltips%7Cjquery.autoEllipsis%2CcheckboxShiftClick%2Chidpi%2ChighlightText%2CmakeCollapsible%2Cmw-jump%2Cplaceholder%2Csuggestions%7Cmediawiki.action.view.postEdit%7Cmediawiki.api%2Chidpi%2CsearchSuggest%2Cuser%7Cmediawiki.page.ready%7Cskins.vector.js&skin=vector&version=20160525T224016Z&*'

# The URLs are quoted because they contain '?' and (in s2, s3) a trailing '*', which the shell
# would otherwise treat as filename patterns.  Only $opts is left unquoted, so that it splits
# into separate options.
# shellcheck disable=SC2086
nice wget $opts \
  "${q}de${r}${s1}" "${q}en${r}${s1}" \
  "${q}de${r}${s2}" "${q}en${r}${s2}" \
  "${q}de${r}${s3}" "${q}en${r}${s3}"



## - N o t e s -----------------------------------------------------------------------------------------
#  [3A] · The form Special%3A (as opposed to Special:) covers encoded page URLs such as
#         http://reluk.ca/mediawiki/index.php?title=Special%3APrefixIndex&namespace=14.
#
#  [WG] · The various '$wg' variables are from LocalSettings.php, the wiki's configuration file.