require 'rubygems'
require 'net/http'
require 'uri'
def getbody(letter, pagenumber)
url = URI.parse('http://www.somesite.com/')
res = Net::HTTP.start(url.host, url.port) { |http|
http.get("/browse.php?character=#{letter}&page=#{pagenumber}")
}
gbody = res.body
slice1 = gbody.index("columns")
slice2 = gbody.rindex("<center>")
gbody = gbody.slice(slice1, (slice2-slice1+150))
return gbody
end
def getpages(htmlbody)
pagebody = htmlbody.slice(htmlbody.index("<center>"),50)
slice1 = pagebody.index("<b>")
slice2 = pagebody.index("</b>")
pages = pagebody.slice(slice1+3, (slice2-slice1-3))
pages = Integer(pages)
return pages
end
alphabet = %w(a b c d e f g h i j k l m n o p q r s t u v w x y z *)
alphacounter = 0;
allwords = Array.new
out = File.new('wordlist.html', 'w')
out.puts "<html>"
while alphacounter < alphabet.length
counter = 1
cycles = 1
while counter <= cycles
body = getbody(alphabet[alphacounter],(counter))
if counter == 1
if alphabet[alphacounter] == "s"
cycles = 1010
else
cycles = getpages(body)
end
puts "cycles: #{cycles}"
end
counter += 1
wordbody = body.slice(0,body.index("</table>"))
peices = Array.new
innercounter = 0;
while (wordbody.include? "<a href")
slice1 = wordbody.index("<a href")
slice2 = wordbody.index("</a>")
peices[innercounter] = wordbody.slice(slice1,(slice2-slice1+4))
innercounter += 1
wordbody = wordbody.slice((slice2+4),wordbody.length)
end
peices.compact!
i = 0
while i < peices.length
peices[i].insert((peices[i].index("=")+2),"http://www.somesite.com")
i += 1
end
peices.each do |item|
out.puts "#{item} <br>"
end
puts "Letter: #{alphabet[alphacounter]} page: #{counter} total pages: #{cycles}"
end
alphacounter += 1
end
out.puts "</html>"
Refactorings
No refactoring yet !
danielharan
May 31, 2008, May 31, 2008 17:27, permalink
You're mixing too many concerns. In particular, separating the scraping from the processing is usually a huge win for readability; I save fetched pages in /tmp/cache. That way if I mess up the retrieval of data from the HTML (which happens, oh, always), I don't have to hit their servers again.
If there are two levels of pages to retrieve - search listings and actual data pages, you can write two different scrapers.
# why does the alphabet include '*' anyways?
(('a'..'z').to_a << '*').each do |letter|
`curl|request somesite.com/search?q=#{letter} to /tmp/cache/searches/#{letter}`
end
elliottcable
June 9, 2008, June 09, 2008 16:21, permalink
You might also want to look at Tempfile: http://www.ruby-doc.org/stdlib/libdoc/tempfile/rdoc/index.html
Using built-in Ruby tools > throwing it out to a system call d-:
Tempfile.new do |f| # Grab page, and save it here - then pass the tempfile on to the next section end
perfection is reached not when there is nothing left to add, but when there is nothing left to take away...