#!/usr/bin/ruby
################################################################################
## File: downloadWikiPage.rb
## Author: Henry A. Feild (hfeild @ cs . umass . edu)
## Date: 06-Jul-2011
## Usage text; the script prints it to STDERR when no title argument is given.
usage = "
Usage: downloadWikiPage.rb <title>
Downloads the wiki text of the Wikipedia page with the given title to a file
called <title>.
"
################################################################################
require 'rubygems'
require 'json'
require 'net/http'
require 'uri'
## Response format requested from the API when the caller does not pass one.
DEFAULT_FORMAT = "json"

####
## Returns the URI to access the Wikipedia page with the specified title
## in the given format.
##
## title::  The raw (not URL-encoded) page title. It is form-encoded here, so
##          titles containing spaces, '&', '#' or non-ASCII characters are
##          safe to pass.
## format:: (Optional) The response format (see http://www.mediawiki.org/wiki/API
##          for more information). Defaults to DEFAULT_FORMAT.
##
## Return:: A URI object for the Wikipedia API.
####
def toLink( title, format=DEFAULT_FORMAT )
  ## Build the query from key/value pairs so every value is escaped; the
  ## parameter order matches the original hand-built URL.
  query = URI.encode_www_form( [
    ["titles",    title],
    ["action",    "query"],
    ["rvprop",    "content"],
    ["prop",      "revisions"],
    ["format",    format],
    ["redirects", "1"]
  ] )

  ## Use https: Wikimedia redirects plain-http requests to HTTPS, and
  ## Net::HTTP.get does not follow redirects.
  URI.parse( "https://en.wikipedia.org/w/api.php?#{query}" )
end
## Check the args -- there should be at least one (the title).
## Exit with a non-zero status so callers can tell the run failed.
if ARGV.empty?
  STDERR.puts usage
  exit 1
end

title = ARGV.shift

## Get the wiki text in JSON.
response = JSON.parse( Net::HTTP.get( toLink( title, "json" ) ) )

## From the JSON, get a hold of the actual wiki text content.
## (If you navigate to the api URL in a browser and specify the
## format as: format=jsonfmt, you can see a pretty print of
## the JSON structure).
## The page id used as the key under 'pages' is not known up front, so the
## first entry is taken. Each lookup tolerates an absent key so that a
## response without revisions produces a clear message instead of a
## NoMethodError on nil.
pages    = ( response['query'] || {} )['pages'] || {}
page     = pages.values.first || {}
revision = ( page['revisions'] || [] ).first || {}
content  = revision['*']

if content.nil?
  STDERR.puts "No wiki text found for page: #{title}"
  exit 1
end

## Write the output file; we're just using the title as the
## file name for now. The file is only created once there is content to
## write, and the block form closes it even if the write raises.
File.open( title, "w" ) do |fd|
  fd.puts content
end