175ab087b0647367da3d370fbf5bc024

Hi,

I'm looking for a way to speed up mail RoR Rake Task below :
(start reading the code at "task :generate_sitemap => :environment")

### The Problem is: ###
the Sitemaps I'm generating are very large (> 35000 URLs) and each of the Sitemaps comes in 4 languages
--> Sitemap generation takes several minutes

### Optimizations I've already done ###
- installed the "fast_xs" gem to speed up the whole rails XML Builder
- "pipeing" the generated xml directly into the gzip files to (hopefully) save some memory
(see "xml = Builder::XmlMarkup.new(:target => gzip)")

Does any of you have any other clever idea how to speed up the code? I'd really appreciate any sort of help :-)

def render_page_sitemap(scope, sitemapname,statics, machines, lang)
    file = RAILS_ROOT + "/public/"+ sitemapname+"_"+lang+".xml.gz"
    Zlib::GzipWriter.open(file) do |gzip|
        xml = Builder::XmlMarkup.new(:target => gzip)
        xml.instruct!
        xml.urlset("xmlns:xsi" => 'http://www.w3.org/2001/XMLSchema-instance', "xsi:schemaLocation" =>"http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd", "xmlns" => "http://www.sitemaps.org/schemas/sitemap/0.9") do
            statics.each do |page|
                xml.url do
                    xml.loc(url_for(:only_path => false, :controller => "homepages", :action => page, :lang => lang, :scope => scope))
                    xml.lastmod(Time.now.strftime('%Y-%m-%d'))
                    xml.changefreq("daily")
                end
            end

            machines.each do |mach|
                xml.url do
                    xml.loc(url_for(:only_path => false, :controller => "homepages", :action => 'showmach', :id => mach, :lang => lang, :scope => scope))
                    xml.lastmod(DateTime.strptime(mach.created, '%Y-%m-%d %H:%M:%S').strftime("%Y-%m-%dT%H:%M:%S+00:00"))
                    xml.changefreq("monthly")
                end
            end
        end #XML
        gzip.close
    end #GZIP
end

def render_overall_sitemap(sitemapname, baseurl)
    file = RAILS_ROOT + "/public/overall_"+sitemapname+".xml.gz"
    Zlib::GzipWriter.open(file) do |gzip|
        xml = Builder::XmlMarkup.new(:target => gzip)
        xml.instruct!
        xml.sitemapindex("xmlns:xsi" => 'http://www.w3.org/2001/XMLSchema-instance', "xsi:schemaLocation" =>"http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd", "xmlns" => "http://www.sitemaps.org/schemas/sitemap/0.9") do
            xml.sitemap do
                xml.loc("http://#{baseurl}/#{sitemapname}_de.xml.gz")
                xml.lastmod(Time.now.strftime('%Y-%m-%d'))
            end
            xml.sitemap do
                xml.loc("http://#{baseurl}/#{sitemapname}_en.xml.gz")
                xml.lastmod(Time.now.strftime('%Y-%m-%d'))
            end
            xml.sitemap do
                xml.loc("http://#{baseurl}/#{sitemapname}_zh.xml.gz")
                xml.lastmod(Time.now.strftime('%Y-%m-%d'))
            end
            xml.sitemap do
                xml.loc("http://#{baseurl}/#{sitemapname}_ru.xml.gz")
                xml.lastmod(Time.now.strftime('%Y-%m-%d'))
            end
        end #XML
        gzip.close
    end #GZIP
end




task :generate_sitemap => :environment do
    include ActionController::UrlWriter  # for url_for

    # IMTM Sitemaps
    default_url_options[:host] = 'www.imtm.com'
    staticpages = Array.new
    staticpages << "index"
    staticpages << "impressum"
    staticpages << "kontakt"
    staticpages << "wir"

    machines = Machine.find(:all, :select => "machines.id, machines.created, model, manufacturer_id", :include => [:manufacturer])
    render_page_sitemap("imtm", "imtm_sitemap",staticpages, machines, "de")
    render_page_sitemap("imtm", "imtm_sitemap",staticpages, machines, "en")
    render_page_sitemap("imtm", "imtm_sitemap",staticpages, machines, "zh")
    render_page_sitemap("imtm", "imtm_sitemap",staticpages, machines, "ru")
    
    render_overall_sitemap("imtm_sitemap", "www.imtm.com")

    # globus Sitemaps
    default_url_options[:host] = 'www.globus-trading.com'
    staticpages = Array.new
    staticpages << "index"
    staticpages << "anfahrt"
    staticpages << "anfrage"
    staticpages << "downloads"
    staticpages << "impressum"
    staticpages << "kontakt"
    staticpages << "produktionslinien"
    staticpages << "wir"

    machines = Machine.find(:all, :select => "machines.id, machines.created, model, manufacturer_id", :conditions => ["globushp = 1 OR globusspecial = 1 OR globusstart = 1 OR globusstartspecial = 1"], :include => [:manufacturer])
    render_page_sitemap("globus", "globus_sitemap",staticpages, machines, "de")
    render_page_sitemap("globus", "globus_sitemap",staticpages, machines, "en")
    render_page_sitemap("globus", "globus_sitemap",staticpages, machines, "zh")
    render_page_sitemap("globus", "globus_sitemap",staticpages, machines, "ru")
    
    render_overall_sitemap("globus_sitemap", "www.globus-trading.com")
 
end

Refactorings

No refactoring yet !

F9a9ba6663645458aa8630157ed5e71e

Ants

February 15, 2010, February 15, 2010 09:01, permalink

No rating. Login to rate!

In the cluster of calls to render_page_sitemap() it looks like the only thing that varies is the lang parameter. The lang parameter is only used to build up an output filename, as well as the URLs. Why not call the function once and have it generate its output with place holders, and then just iterate later to replace the placeholders with the appropriate language?

Also, the xml.lastmod(...) computations are done each time for every URL, probably can be computed just once if you don't mind a little bit of inaccuracy that could be off by just a few minutes.

175ab087b0647367da3d370fbf5bc024

tiekuhn

February 23, 2010, February 23, 2010 23:02, permalink

No rating. Login to rate!

Hi Ants,

you definitly gave me the right hints :-). I finally could isolate the "slow part" of the whole script!
somehow the generation of the "lastmod" timestamps based on my DB records took an awfull lot of time!
So I replaced this part with a string based alternative --> run time of the script decreased from > 10 min to ~1-2 min!!!

for the changed code, see below:

thanks again very much for your help!

SLOOOOOOW:
xml.lastmod(DateTime.strptime(mach.created, '%Y-%m-%d %H:%M:%S').strftime("%Y-%m-%dT%H:%M:%S+00:00"))

MUCH faster:
xml.lastmod("#{mach.created.tr(' ','T')}+00:00")
175ab087b0647367da3d370fbf5bc024

tiekuhn

February 23, 2010, February 23, 2010 23:03, permalink

No rating. Login to rate!

Hi Ants,

you definitly gave me the right hints :-). I finally could isolate the "slow part" of the whole script!
somehow the generation of the "lastmod" timestamps based on my DB records took an awfull lot of time!
So I replaced this part with a string based alternative --> run time of the script decreased from > 10 min to ~1-2 min!!!

for the changed code, see below:

thanks again very much for your help!

SLOOOOOOW:
xml.lastmod(DateTime.strptime(mach.created, '%Y-%m-%d %H:%M:%S').strftime("%Y-%m-%dT%H:%M:%S+00:00"))

MUCH faster:
xml.lastmod("#{mach.created.tr(' ','T')}+00:00")
175ab087b0647367da3d370fbf5bc024

tiekuhn

February 23, 2010, February 23, 2010 23:55, permalink

No rating. Login to rate!

to speed up things even more I rewrote the rake task in perl
--> run time < 1 sec!!!

but beware....i'm a newby in perl --> code is really ugly and could need lots of improvement :-) so if you want to help....you're more then welcome

#!/usr/bin/perl

use Mysql;

# MYSQL CONFIG VARIABLES
$host = "localhost:3306";
$database = "somedb";
$user = "someuser";
$pw = "somepw";

# PERL MYSQL CONNECT()
$connect = Mysql->connect($host, $database, $user, $pw);

# SELECT DB
$connect->selectdb($database);

# DEFINE A VARIABLES
$imtmquery = "SELECT machines.id, manufacturers.name, machines.model, machines.created FROM machines LEFT JOIN manufacturers ON machines.manufacturer_id = manufacturers.id";
@imtmstatics = ("blaa", "blubb");
$globusquery = "SELECT machines.id, manufacturers.name, machines.model, machines.created FROM machines LEFT JOIN manufacturers ON machines.manufacturer_id = manufacturers.id WHERE globushp = 1 OR globusspecial = 1 OR globusstart = 1 OR globusstartspecial = 1";
@globusstatics = ("blaa", "blubb");

$xmlheader = '<?xml version="1.0" encoding="UTF-8" ?><urlset xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">';
$xmlfooter = '</urlset>';

sub print_machine_link
{
	return	"<url>
			<loc>http://www.".$_[0]."/".$_[5]."/showmach/".$_[1]."-".$_[2]."-".$_[3]."</loc> 
			<lastmod>".$_[4]."+00:00</lastmod> 
			<changefreq>monthly</changefreq> 
		</url>";
}

sub print_page_link
{
	
	return	"<url>
			<loc>http://www.".$_[0]."/".$_[1]."/".$_[2]."</loc> 
			<lastmod></lastmod> 
			<changefreq>monthly</changefreq> 
		</url>";
}

open (IMTMDE, '>imtm_sitemap_de.xml');
open (IMTMEN, '>imtm_sitemap_en.xml');
open (IMTMZH, '>imtm_sitemap_zh.xml');
open (IMTMRU, '>imtm_sitemap_ru.xml');
print IMTMDE $xmlheader;
print IMTMEN $xmlheader;
print IMTMZH $xmlheader;
print IMTMRU $xmlheader;

# RENDER STATIC IMTM PAGES
for ($i=0;$i<@imtmstatics;$i++){
	print IMTMDE print_page_link("imtm.com", "de", $imtmstatics[$i]);
	print IMTMEN print_page_link("imtm.com", "en", $imtmstatics[$i]);
	print IMTMZH print_page_link("imtm.com", "zh", $imtmstatics[$i]);
	print IMTMRU print_page_link("imtm.com", "ru", $imtmstatics[$i]);
}

# EXECUTE IMTM QUERY
$execute = $connect->query($imtmquery);

while (@results = $execute->fetchrow()) {
	$manufacturer = lc($results[1]);
	$model = lc($results[2]);
	$timestamp = $results[3];
	$manufacturer =~ tr/A-Za-z_0-9/-/c;
	$model =~ tr/A-Za-z_0-9/-/c;
	$timestamp =~ tr/ /T/;
	print IMTMDE print_machine_link("imtm.com", $results[0], $manufacturer, $model, $timestamp, "de");
	print IMTMEN print_machine_link("imtm.com", $results[0], $manufacturer, $model, $timestamp, "en");
	print IMTMZH print_machine_link("imtm.com", $results[0], $manufacturer, $model, $timestamp, "zh");
	print IMTMRU print_machine_link("imtm.com", $results[0], $manufacturer, $model, $timestamp, "ru");
}

print IMTMDE $xmlfooter;
print IMTMEN $xmlfooter;
print IMTMZH $xmlfooter;
print IMTMRU $xmlfooter;

close (IMTMDE);
close (IMTMEN);
close (IMTMZH);
close (IMTMRU);






open (GLOBUSDE, '>globus_sitemap_de.xml');
open (GLOBUSEN, '>globus_sitemap_en.xml');
open (GLOBUSZH, '>globus_sitemap_zh.xml');
open (GLOBUSRU, '>globus_sitemap_ru.xml');
print GLOBUSDE $xmlheader;
print GLOBUSEN $xmlheader;
print GLOBUSZH $xmlheader;
print GLOBUSRU $xmlheader;

# RENDER STATIC Globus PAGES
for ($i=0;$i<@globusstatics;$i++){
	print GLOBUSDE print_page_link("globus-trading.com", "de", $globusstatics[$i]);
	print GLOBUSEN print_page_link("globus-trading.com", "en", $globusstatics[$i]);
	print GLOBUSZH print_page_link("globus-trading.com", "zh", $globusstatics[$i]);
	print GLOBUSRU print_page_link("globus-trading.com", "ru", $globusstatics[$i]);
}

# EXECUTE Globus QUERY
$execute = $connect->query($globusquery);

while (@results = $execute->fetchrow()) {
	$manufacturer = lc($results[1]);
	$model = lc($results[2]);
	$manufacturer =~ tr/A-Za-z_0-9/-/c;
	$model =~ tr/A-Za-z_0-9/-/c;
	$timestamp = $results[3];
	$timestamp =~ tr/ /T/;
	print GLOBUSDE print_machine_link("globus-trading.com", $results[0], $manufacturer, $model, $timestamp, "de");
	print GLOBUSEN print_machine_link("globus-trading.com", $results[0], $manufacturer, $model, $timestamp, "en");
	print GLOBUSZH print_machine_link("globus-trading.com", $results[0], $manufacturer, $model, $timestamp, "zh");
	print GLOBUSRU print_machine_link("globus-trading.com", $results[0], $manufacturer, $model, $timestamp, "ru");
}
print GLOBUSDE $xmlfooter;
print GLOBUSEN $xmlfooter;
print GLOBUSZH $xmlfooter;
print GLOBUSRU $xmlfooter;

close (GLOBUSDE);
close (GLOBUSEN);
close (GLOBUSZH);
close (GLOBUSRU);

system "gzip imtm_sitemap_de.xml";
system "gzip imtm_sitemap_en.xml";
system "gzip imtm_sitemap_zh.xml";
system "gzip imtm_sitemap_ru.xml";

system "gzip globus_sitemap_de.xml";
system "gzip globus_sitemap_en.xml";
system "gzip globus_sitemap_zh.xml";
system "gzip globus_sitemap_ru.xml";

Your refactoring





Format Copy from initial code

or Cancel