#!/usr/bin/env ruby
require 'rubygems'
require 'date'
require 'open-uri'
require 'hpricot'
require 'icalendar'
# Scrapes a tournament table out of an HTML page and exports the tournaments
# as an iCalendar file.
#
# Usage:
#   calendar = ATPCalendar.new("atp_cal2009.html")
#   calendar.build_ical("atp_tourneys_2009.ics")
class ATPCalendar
  # Hpricot path selecting the table rows; each row describes one tournament.
  ROW_SELECTOR = "div.maincolwide//table[4]//tr".freeze

  # Byte sequence \302\240 (a UTF-8 non-breaking space); cells containing it
  # are treated as placeholders and skipped.
  NBSP = /\302\240/

  # Anything starting with http:// or https:// is fetched over the network;
  # every other source is treated as a local file path.
  URL_PATTERN = %r{\Ahttps?://}i

  # Loads and parses the document, then builds the tournament list.
  #
  # @param source [String] URL or local path of the HTML page to scrape
  def initialize(source)
    @source = source
    @hp = load_document(@source)
    @tourneys = prettify_output(parse_resource(@hp))
  end

  # Extracts the raw text of every tournament row.
  #
  # Empty cells and cells containing a non-breaking space are dropped, tabs
  # are removed and surrounding whitespace is stripped. Rows without any
  # remaining cell are omitted.
  #
  # @param doc [#/] parsed document that answers Hpricot-style path searches
  # @return [Array<Array<String>>] one array of cell texts per table row
  def parse_resource(doc)
    @rows = (doc / ROW_SELECTOR).map { |row| extract_cells(row) }.reject(&:empty?)
  end

  # Writes all tournaments as events into an iCalendar file located in the
  # current working directory.
  #
  # @param filename [String] name of the .ics file to create (overwritten if present)
  # @return [nil]
  def build_ical(filename)
    cal = Icalendar::Calendar.new
    @tourneys.each do |t|
      # Build the event standalone and add it exactly once; combining
      # cal.event with cal.add_event would register the same event twice.
      event = Icalendar::Event.new
      event.start = t[:date]
      event.summary = t[:tourney]
      event.description = "Surface: #{t[:surface]}; Winners: #{t[:winners]}"
      event.location = t[:location]
      cal.add_event(event)
    end
    # Block form guarantees the file is flushed and closed.
    File.open(File.join(Dir.getwd, filename), "w") { |file| file.puts cal.to_ical }
  end

  private

  # Opens the source (URL or file) and parses it with Hpricot; the underlying
  # IO is closed as soon as parsing is done.
  def load_document(source)
    if source =~ URL_PATTERN
      URI.open(source) { |io| Hpricot(io) }
    else
      # File.open instead of Kernel#open: a path must never be interpreted
      # as a command to run.
      File.open(source) { |file| Hpricot(file) }
    end
  end

  # Returns the cleaned text of every meaningful cell of one table row.
  def extract_cells(row)
    (row / "td").each_with_object([]) do |cell, cells|
      text = cell.inner_text
      next if text.empty? || text =~ NBSP # skip empty / placeholder cells

      cells << text.delete("\t").strip # remove funny formatting
    end
  end

  # Creates a nicely formatted hash out of each row of tournament info.
  #
  # The second cell holds location and tournament name separated by a
  # newline; a line break is inserted before a directly appended "ATP".
  #
  # @param rows [Array<Array<String>>] rows as returned by #parse_resource
  # @return [Array<Hash>] one hash per tournament
  def prettify_output(rows)
    rows.map do |date, location, surface, price, tickets, winners|
      location, tourney = location.split("\n")
      {
        :date => Date.parse(date),
        :location => location,
        :tourney => tourney.sub(/(\w)ATP/, "\\1\nATP"),
        :surface => surface,
        :price => price,
        :tickets => tickets,
        :winners => winners || "no info available"
      }
    end
  end
end
# Script entry point: parse the downloaded ATP page and write the calendar
# file into the current working directory.
ATPCalendar.new("atp_cal2009.html").build_ical("atp_tourneys_2009.ics")
Refactorings
No refactoring yet !
Marc-Andre
April 20, 2009, April 20, 2009 17:49, permalink
Here's a prettified prettify_output
# Turns every raw table row into a hash describing one tournament.
#
# A row is expected to be [date, "location\ntourney", surface, price,
# tickets, winners]; a missing winners cell becomes "no info available".
# Returns the array of hashes (nothing is stored in instance variables).
def prettify_output(rows)
  rows.collect do |row|
    day, place_and_name, court, cost, ticket_info, champions = row
    place, name = place_and_name.split("\n")

    entry = {}
    entry[:date] = Date.parse(day)
    entry[:location] = place
    # put a line break in front of an "ATP" glued onto the tournament name
    entry[:tourney] = name.sub(/(\w)ATP/, "\\1\nATP")
    entry[:surface] = court
    entry[:price] = cost
    entry[:tickets] = ticket_info
    entry[:winners] = champions || "no info available"
    entry
  end
end
# In ATPCalendar#initialize, replace the bare prettify_output(...) call with
# this assignment: the prettify_output version above returns the array of
# hashes from rows.map instead of filling @tourneys itself, and build_ical
# iterates over @tourneys.
@tourneys = prettify_output(parse_resource(@hp))
The official ATP site has a table showing general information about its 2009 tennis tournaments. The following script scrapes that information and builds an iCalendar file from it. The code doesn't feel tidy at all, so any tips for improvement are greatly appreciated.
Note: the script works with a downloaded version of the ATP site in question.
Here is the site for further details on the script:
http://intothespirit.com/blog/2009/04/07/atp-2009-tournaments.html