# Writes the gzipped sitemap "<RAILS_ROOT>/public/<sitemapname>_<lang>.xml.gz".
#
# One <url> entry is emitted per static page (changefreq "daily", lastmod set
# to today's date), followed by one entry per machine (changefreq "monthly",
# lastmod derived from the machine's +created+ value).
#
# scope       - passed through to url_for as :scope
# sitemapname - file name prefix of the generated sitemap
# statics     - enumerable of action names on the "homepages" controller
# machines    - enumerable of records; each one is handed to url_for as :id
#               and must respond to +created+
# lang        - language code, used for :lang and as the file name suffix
def render_page_sitemap(scope, sitemapname, statics, machines, lang)
  # Interpolation instead of "+" so the path is built the same way whether
  # RAILS_ROOT is a String or a Pathname.
  file = "#{RAILS_ROOT}/public/#{sitemapname}_#{lang}.xml.gz"

  # Loop-invariant: every static page gets the same date, so format it once.
  today = Time.now.strftime('%Y-%m-%d')

  # The block form of GzipWriter.open closes the stream when the block ends,
  # so no explicit close is needed.
  Zlib::GzipWriter.open(file) do |gzip|
    xml = Builder::XmlMarkup.new(:target => gzip)
    xml.instruct!
    xml.urlset("xmlns:xsi" => 'http://www.w3.org/2001/XMLSchema-instance', "xsi:schemaLocation" =>"http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd", "xmlns" => "http://www.sitemaps.org/schemas/sitemap/0.9") do
      statics.each do |page|
        xml.url do
          xml.loc(url_for(:only_path => false, :controller => "homepages", :action => page, :lang => lang, :scope => scope))
          xml.lastmod(today)
          xml.changefreq("daily")
        end
      end

      machines.each do |mach|
        xml.url do
          xml.loc(url_for(:only_path => false, :controller => "homepages", :action => 'showmach', :id => mach, :lang => lang, :scope => scope))
          # Parsing and re-formatting every timestamp through DateTime was the
          # dominant cost of the whole task. For a "YYYY-MM-DD HH:MM:SS"
          # string, swapping the space for "T" yields the same result.
          # NOTE(review): assumes +created+ is always a zero-padded string in
          # that format -- confirm against the column definition.
          xml.lastmod("#{mach.created.tr(' ', 'T')}+00:00")
          xml.changefreq("monthly")
        end
      end
    end # XML
  end # GZIP
end
# Writes the gzipped sitemap index
# "<RAILS_ROOT>/public/overall_<sitemapname>.xml.gz".
#
# The index holds one <sitemap> entry per language, each pointing at
# "http://<baseurl>/<sitemapname>_<lang>.xml.gz" with today's date as lastmod.
#
# sitemapname - file name prefix shared by the per-language sitemaps
# baseurl     - host name (without scheme) under which the sitemaps are served
# langs       - language codes to list; defaults to the four languages that
#               were previously hard-coded, in the same order
def render_overall_sitemap(sitemapname, baseurl, langs = %w[de en zh ru])
  file = "#{RAILS_ROOT}/public/overall_#{sitemapname}.xml.gz"

  # Same date for every entry, so format it once.
  today = Time.now.strftime('%Y-%m-%d')

  # The block form of GzipWriter.open closes the stream when the block ends,
  # so no explicit close is needed.
  Zlib::GzipWriter.open(file) do |gzip|
    xml = Builder::XmlMarkup.new(:target => gzip)
    xml.instruct!
    xml.sitemapindex("xmlns:xsi" => 'http://www.w3.org/2001/XMLSchema-instance', "xsi:schemaLocation" =>"http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd", "xmlns" => "http://www.sitemaps.org/schemas/sitemap/0.9") do
      langs.each do |lang|
        xml.sitemap do
          xml.loc("http://#{baseurl}/#{sitemapname}_#{lang}.xml.gz")
          xml.lastmod(today)
        end
      end
    end # XML
  end # GZIP
end
# Rake task: regenerates the per-language sitemaps and the sitemap index for
# both sites (IMTM first, then Globus).
task :generate_sitemap => :environment do
  include ActionController::UrlWriter # for url_for

  # Every sitemap is rendered once per language, in this order.
  languages = %w[de en zh ru]
  # Columns loaded for each machine record.
  machine_columns = "machines.id, machines.created, model, manufacturer_id"

  # IMTM Sitemaps
  default_url_options[:host] = 'www.imtm.com'
  staticpages = %w[index impressum kontakt wir]
  machines = Machine.find(:all, :select => machine_columns, :include => [:manufacturer])
  languages.each do |lang|
    render_page_sitemap("imtm", "imtm_sitemap", staticpages, machines, lang)
  end
  render_overall_sitemap("imtm_sitemap", "www.imtm.com")

  # globus Sitemaps
  default_url_options[:host] = 'www.globus-trading.com'
  staticpages = %w[index anfahrt anfrage downloads impressum kontakt produktionslinien wir]
  # Only machines flagged for one of the Globus listings are included.
  machines = Machine.find(:all, :select => machine_columns, :conditions => ["globushp = 1 OR globusspecial = 1 OR globusstart = 1 OR globusstartspecial = 1"], :include => [:manufacturer])
  languages.each do |lang|
    render_page_sitemap("globus", "globus_sitemap", staticpages, machines, lang)
  end
  render_overall_sitemap("globus_sitemap", "www.globus-trading.com")
end
Refactorings
No refactoring yet !
Ants
February 15, 2010, February 15, 2010 09:01, permalink
In the cluster of calls to render_page_sitemap(), it looks like the only thing that varies is the lang parameter. The lang parameter is only used to build the output filename and the URLs. Why not call the function once, have it generate its output with placeholders, and then iterate afterwards to replace the placeholders with the appropriate language?
Also, the xml.lastmod(...) computations are done each time for every URL; they could probably be computed just once, if you don't mind the timestamps being off by a few minutes.
tiekuhn
February 23, 2010, February 23, 2010 23:02, permalink
Hi Ants,
you definitely gave me the right hints :-). I was finally able to isolate the "slow part" of the whole script!
Somehow the generation of the "lastmod" timestamps based on my DB records took an awful lot of time!
So I replaced this part with a string-based alternative --> the run time of the script decreased from > 10 min to ~1-2 min!!!
for the changed code, see below:
thanks again very much for your help!
SLOOOOOOW:
xml.lastmod(DateTime.strptime(mach.created, '%Y-%m-%d %H:%M:%S').strftime("%Y-%m-%dT%H:%M:%S+00:00"))
MUCH faster:
xml.lastmod("#{mach.created.tr(' ','T')}+00:00")
tiekuhn
February 23, 2010, February 23, 2010 23:03, permalink
Hi Ants,
you definitely gave me the right hints :-). I was finally able to isolate the "slow part" of the whole script!
Somehow the generation of the "lastmod" timestamps based on my DB records took an awful lot of time!
So I replaced this part with a string-based alternative --> the run time of the script decreased from > 10 min to ~1-2 min!!!
for the changed code, see below:
thanks again very much for your help!
SLOOOOOOW:
xml.lastmod(DateTime.strptime(mach.created, '%Y-%m-%d %H:%M:%S').strftime("%Y-%m-%dT%H:%M:%S+00:00"))
MUCH faster:
xml.lastmod("#{mach.created.tr(' ','T')}+00:00")
tiekuhn
February 23, 2010, February 23, 2010 23:55, permalink
To speed things up even more, I rewrote the rake task in Perl
--> run time < 1 sec!!!
But beware... I'm a newbie in Perl --> the code is really ugly and needs lots of improvement :-) So if you want to help... you're more than welcome.
#!/usr/bin/perl
# Setup section of the sitemap generator: database connection, the two
# machine queries, the static page lists and the XML envelope. Everything
# here is a package global that the rest of the script reads.
use Mysql;
# MySQL connection settings
# NOTE(review): these look like placeholder credentials -- replace before use.
$host = "localhost:3306";
$database = "somedb";
$user = "someuser";
$pw = "somepw";
# Connect to the MySQL server
$connect = Mysql->connect($host, $database, $user, $pw);
# Select the database to run the queries against
$connect->selectdb($database);
# IMTM: all machines, with the manufacturer name joined in
$imtmquery = "SELECT machines.id, manufacturers.name, machines.model, machines.created FROM machines LEFT JOIN manufacturers ON machines.manufacturer_id = manufacturers.id";
# IMTM: names of the static pages to list (passed to print_page_link below)
@imtmstatics = ("blaa", "blubb");
# Globus: same columns, restricted to machines with one of the globus* flags set
$globusquery = "SELECT machines.id, manufacturers.name, machines.model, machines.created FROM machines LEFT JOIN manufacturers ON machines.manufacturer_id = manufacturers.id WHERE globushp = 1 OR globusspecial = 1 OR globusstart = 1 OR globusstartspecial = 1";
# Globus: names of the static pages to list (passed to print_page_link below)
@globusstatics = ("blaa", "blubb");
# XML declaration plus opening <urlset> tag, written at the start of every sitemap file
$xmlheader = '<?xml version="1.0" encoding="UTF-8" ?><urlset xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">';
# Closing tag, written at the end of every sitemap file
$xmlfooter = '</urlset>';
# Builds the <url> entry for one machine detail page.
#
# Arguments, in order: domain (without the "www." prefix), machine id,
# manufacturer slug, model slug, timestamp, language code.
# Returns the XML fragment as a string; nothing is printed here.
sub print_machine_link
{
    my ($domain, $id, $manufacturer, $model, $timestamp, $lang) = @_;

    my $location = "http://www." . $domain . "/" . $lang . "/showmach/"
                 . join("-", $id, $manufacturer, $model);

    # The timestamp is emitted as given, with a fixed "+00:00" offset appended.
    return join("\n",
        "<url>",
        "<loc>" . $location . "</loc>",
        "<lastmod>" . $timestamp . "+00:00</lastmod>",
        "<changefreq>monthly</changefreq>",
        "</url>");
}
# Builds the <url> entry for one static page.
#
# Arguments, in order: domain (without the "www." prefix), language code,
# page name.
# Returns the XML fragment as a string; nothing is printed here.
sub print_page_link
{
    my ($domain, $lang, $page) = @_;

    # The sitemap schema requires <lastmod> to contain a W3C date, so an empty
    # element makes the file invalid. Use today's date (local time), which is
    # what the Rake version of this generator writes for static pages.
    my @now = localtime();
    my $today = sprintf("%04d-%02d-%02d", $now[5] + 1900, $now[4] + 1, $now[3]);

    return join("\n",
        "<url>",
        "<loc>http://www." . $domain . "/" . $lang . "/" . $page . "</loc>",
        "<lastmod>" . $today . "</lastmod>",
        "<changefreq>monthly</changefreq>",
        "</url>");
}
# Main part of the script: writes one sitemap file per site and language
# ("<prefix>_<lang>.xml" in the current directory) and gzips each of them.
#
# Reads the globals set up at the top of the script ($connect, $imtmquery,
# @imtmstatics, $globusquery, @globusstatics, $xmlheader, $xmlfooter) and uses
# print_page_link / print_machine_link to build the entries.

# Languages every sitemap is generated in, in output order.
my @sitemap_langs = ("de", "en", "zh", "ru");

# One entry per site: output file prefix, domain handed to the link helpers,
# machine query and list of static pages.
my @sitemap_sites = (
    { prefix => "imtm_sitemap",   domain => "imtm.com",           query => $imtmquery,   statics => \@imtmstatics },
    { prefix => "globus_sitemap", domain => "globus-trading.com", query => $globusquery, statics => \@globusstatics },
);

# Every file written, in creation order, so they can be compressed at the end.
my @sitemap_files;

foreach my $site (@sitemap_sites) {
    # Open one output file per language and start it with the XML header.
    my %out;
    foreach my $lang (@sitemap_langs) {
        my $path = $site->{prefix} . "_" . $lang . ".xml";
        # Fail loudly: printing to a handle that never opened would silently
        # produce no sitemap at all.
        open(my $fh, '>', $path) or die "Cannot open $path for writing: $!";
        print $fh $xmlheader;
        $out{$lang} = $fh;
        push @sitemap_files, $path;
    }

    # Static pages
    foreach my $page (@{ $site->{statics} }) {
        foreach my $lang (@sitemap_langs) {
            print { $out{$lang} } print_page_link($site->{domain}, $lang, $page);
        }
    }

    # Machine pages, one row per machine
    my $rows = $connect->query($site->{query})
        or die "Query for " . $site->{prefix} . " failed";
    while (my @row = $rows->fetchrow()) {
        my ($id, $manufacturer, $model, $timestamp) = @row;

        # Build URL slugs: lower-case, then replace every character outside
        # A-Z, a-z, 0-9 and "_" with "-".
        $manufacturer = lc($manufacturer);
        $model        = lc($model);
        $manufacturer =~ tr/A-Za-z_0-9/-/c;
        $model        =~ tr/A-Za-z_0-9/-/c;

        # "YYYY-MM-DD HH:MM:SS" -> "YYYY-MM-DDTHH:MM:SS"
        $timestamp =~ tr/ /T/;

        foreach my $lang (@sitemap_langs) {
            print { $out{$lang} } print_machine_link($site->{domain}, $id, $manufacturer, $model, $timestamp, $lang);
        }
    }

    # Finish and close every file of this site.
    foreach my $lang (@sitemap_langs) {
        print { $out{$lang} } $xmlfooter;
        close($out{$lang}) or warn "Could not close " . $site->{prefix} . "_" . $lang . ".xml: $!\n";
    }
}

# Compress the results. "-f" is required because gzip refuses to overwrite the
# .gz file left behind by a previous run, which would leave a stale sitemap.
foreach my $path (@sitemap_files) {
    system("gzip", "-f", $path) == 0
        or warn "gzip failed for $path (exit status $?)\n";
}
Hi,
I'm looking for a way to speed up my RoR Rake task below:
(start reading the code at "task :generate_sitemap => :environment")
### The Problem is: ###
the Sitemaps I'm generating are very large (> 35000 URLs) and each of the Sitemaps comes in 4 languages
--> Sitemap generation takes several minutes
### Optimizations I've already done ###
- installed the "fast_xs" gem to speed up the whole rails XML Builder
- "piping" the generated XML directly into the gzip files to (hopefully) save some memory
(see "xml = Builder::XmlMarkup.new(:target => gzip)")
Does any of you have any other clever idea how to speed up the code? I'd really appreciate any sort of help :-)