#!/usr/bin/ruby
################################################################################
## File:    downloadWikiPage.rb
## Author:  Henry A. Feild (hfeild @ cs . umass . edu)
## Date:    06-Jul-2011
usage = "
Usage: downloadWikiPage.rb <title>

Downloads the wiki text of the Wikipedia page with the given title to a file
named <title>.

"
################################################################################

require 'rubygems'
require 'json'
require 'net/http'
require 'uri'

DEFAULT_FORMAT = "json"

####
## Returns the URI to access the Wikipedia page with the specified title
## in the given format.
##
## title::  The page title. It is escaped before being placed in the query
##          string, so titles containing spaces, '&', '#', etc. are safe.
## format:: (Optional) The format (see http://www.mediawiki.org/wiki/API for
##          more information).
##
## Return:: A URI object for the Wikipedia API.
####
def toLink( title, format=DEFAULT_FORMAT )
  ## Net::HTTP.get does not follow redirects, so address the HTTPS endpoint
  ## directly instead of relying on an http -> https redirect.
  return URI.parse( "https://en.wikipedia.org/w/api.php?titles=" +
    URI.encode_www_form_component( title ) +
    "&action=query&rvprop=content&prop=revisions&format=" +
    URI.encode_www_form_component( format ) + "&redirects=1" )
end

####
## Pulls the wiki text out of a parsed API response.
##
## (If you navigate to the api URL in a browser and specify the
## format as: format=jsonfmt, you can see a pretty print of
## the JSON structure).
##
## response:: The parsed JSON; the text is looked up at
##            response['query']['pages'][<first page>]['revisions'][0]['*'].
##
## Return:: The value stored under '*', or nil if any level of that path
##          is absent (e.g. the page has no 'revisions' entry).
####
def extractWikiText( response )
  query = response.is_a?( Hash ) ? response['query'] : nil
  pages = query.is_a?( Hash ) ? query['pages'] : nil
  return nil unless pages.is_a?( Hash ) && !pages.empty?

  ## Only the first page in the response is used.
  page = pages.values.first
  revisions = page.is_a?( Hash ) ? page['revisions'] : nil
  return nil unless revisions.is_a?( Array ) && !revisions.empty?

  revision = revisions[0]
  return revision.is_a?( Hash ) ? revision['*'] : nil
end

## Check the args -- there should be at least one (the title).
if ARGV.size < 1
  STDERR.puts usage
  exit 1
end

title = ARGV.shift

## Get the wiki text in JSON.
text = JSON.parse( Net::HTTP.get( toLink( title, "json" ) ) )

## From the JSON, get a hold of the actual wiki text content. This is done
## before the output file is opened so that a failed lookup does not leave
## an empty file behind.
content = extractWikiText( text )
if content.nil?
  STDERR.puts "No wiki text found for page: " + title
  exit 1
end

## Write the output file; we're just using the title as the
## file name for now. The block form closes the file even if the
## write raises.
File.open( title, "w" ) do |fd|
  fd.puts content
end