55502f40dc8b7c769880b10874abc9d0

I'm trying to create a method that provides "best effort" parsing of decimal inputs in cases where I do not know which of these two mutually exclusive ways of writing numbers the end-user is using:

* "." as thousands separator and "," as decimal separator
* "," as thousands separator and "." as decimal separator

The method is implemented as parse_decimal(..) in the code below. Furthermore, I've defined 20 test cases that show how the heuristics of the method should work.

While the code below passes the tests it is quite horrible and unreadable. I'm sure there is a more compact and readable way to implement the method. Possibly including smarter use of regexpes.

#!/usr/bin/perl -wT

use strict;
use warnings;
use Test::More tests => 20;

ok(&parse_decimal("1,234,567") == 1234567);
ok(&parse_decimal("1,234567") == 1.234567);
ok(&parse_decimal("1.234.567") == 1234567);
ok(&parse_decimal("1.234567") == 1.234567);
ok(&parse_decimal("12,345") == 12345);
ok(&parse_decimal("12,345,678") == 12345678);
ok(&parse_decimal("12,345.67") == 12345.67);
ok(&parse_decimal("12,34567") == 12.34567);
ok(&parse_decimal("12.34") == 12.34);
ok(&parse_decimal("12.345") == 12345);
ok(&parse_decimal("12.345,67") == 12345.67);
ok(&parse_decimal("12.345.678") == 12345678);
ok(&parse_decimal("12.34567") == 12.34567);
ok(&parse_decimal("123,4567") == 123.4567);
ok(&parse_decimal("123.4567") == 123.4567);
ok(&parse_decimal("1234,567") == 1234.567);
ok(&parse_decimal("1234.567") == 1234.567);
ok(&parse_decimal("12345") == 12345);
ok(&parse_decimal("12345,67") == 12345.67);
ok(&parse_decimal("1234567") == 1234567);

sub parse_decimal($) {
    my $input = shift;
    $input =~ s/[^\d,\.]//g;
    if ($input !~ /[,\.]/) {
        return &parse_with_separators($input, '.', ',');
    } elsif ($input =~ /\d,\d+\.\d/) {
        return &parse_with_separators($input, '.', ',');
    } elsif ($input =~ /\d\.\d+,\d/) {
        return &parse_with_separators($input, ',', '.');
    } elsif ($input =~ /\d\.\d+\.\d/) {
        return &parse_with_separators($input, ',', '.');
    } elsif ($input =~ /\d,\d+,\d/) {
        return &parse_with_separators($input, '.', ',');
    } elsif ($input =~ /\d{4},\d/) {
        return &parse_with_separators($input, ',', '.');
    } elsif ($input =~ /\d{4}\.\d/) {
        return &parse_with_separators($input, '.', ',');
    } elsif ($input =~ /\d,\d{3}$/) {
        return &parse_with_separators($input, '.', ',');
    } elsif ($input =~ /\d\.\d{3}$/) {
        return &parse_with_separators($input, ',', '.');
    } elsif ($input =~ /\d,\d/) {
        return &parse_with_separators($input, ',', '.');
    } elsif ($input =~ /\d\.\d/) {
        return &parse_with_separators($input, '.', ',');
    } else {
        return &parse_with_separators($input, '.', ',');
    }
}

sub parse_with_separators($$$) {
    my $input = shift;
    my $decimal_separator = shift;
    my $thousand_separator = shift;
    my $output = $input;
    $output =~ s/\Q${thousand_separator}\E//g;
    $output =~ s/\Q${decimal_separator}\E/./g;
    return $output;
}

Refactorings

No refactoring yet !

35cb5954a17f0e0c0a819bcaad5ea424

Ruben Wisniewski

September 28, 2009, September 28, 2009 20:00, permalink

No rating. Login to rate!

I hope I added my patterns right in you're code, sorry if there are bugs, but I can't write perl.

#!/usr/bin/perl -wT

use strict;
use warnings;
use Test::More tests => 20;

ok(&parse_decimal("1,234,567") == 1234567);
ok(&parse_decimal("1,234567") == 1.234567);
ok(&parse_decimal("1.234.567") == 1234567);
ok(&parse_decimal("1.234567") == 1.234567);
ok(&parse_decimal("12,345") == 12345);
ok(&parse_decimal("12,345,678") == 12345678);
ok(&parse_decimal("12,345.67") == 12345.67);
ok(&parse_decimal("12,34567") == 12.34567);
ok(&parse_decimal("12.34") == 12.34);
ok(&parse_decimal("12.345") == 12345);
ok(&parse_decimal("12.345,67") == 12345.67);
ok(&parse_decimal("12.345.678") == 12345678);
ok(&parse_decimal("12.34567") == 12.34567);
ok(&parse_decimal("123,4567") == 123.4567);
ok(&parse_decimal("123.4567") == 123.4567);
ok(&parse_decimal("1234,567") == 1234.567);
ok(&parse_decimal("1234.567") == 1234.567);
ok(&parse_decimal("12345") == 12345);
ok(&parse_decimal("12345,67") == 12345.67);
ok(&parse_decimal("1234567") == 1234567);

sub parse_decimal($) {
    my $input = shift;
    $input =~ s/[^\d,\.]//g;
    if ($input !~ /[,\.]/) {
        return &parse_with_separators($input, '.', ',');
    } elsif ($input =~ /^[-+]{0,1}(\d{1,3}[.]){0,1}(\d{3}[.]){0,}\d{3}([,]\d{0,}){0,1}$/) {
        return &parse_with_separators($input, '.', ',');
    } elsif ($input =~ /^[-+]{0,1}\d{0,}[.]\d{0,}$/) {
        return &parse_with_separators($input, '.', ',');
    } elsif ($input =~ /^[-+]{0,1}\d{0,}[,]\d{0,}$/) {
        return &parse_with_separators($input, ',', '.');
    } elsif ($input =~ /^[-+]{0,1}(\d{1,3}[,]){0,1}(\d{3}[,]){0,}\d{3}([.]\d{0,}){0,1}$/) {
        return &parse_with_separators($input, ',', '.');
    } else {
        return &parse_with_separators($input, '.', ',');
    }
}

sub parse_with_separators($$$) {
    my $input = shift;
    my $decimal_separator = shift;
    my $thousand_separator = shift;
    my $output = $input;
    $output =~ s/\Q${thousand_separator}\E//g;
    $output =~ s/\Q${decimal_separator}\E/./g;
    return $output;
}
D85d44a0eca045f40e5a31449277c26c

Ben Marini

October 3, 2009, October 03, 2009 02:56, permalink

No rating. Login to rate!

Seems like all of these test cases can be satisfied with three cases. I didn't write out the reg exps needed to match these cases but you get the idea...

# Case 1: {comma,period} followed by exactly 3 digits and no occurrence of 4+ digits in a row
# Case 2: {comma,period} followed by exactly N digits where N != 3
# Case 3: No commas or periods

ok(&parse_decimal("1,234,567") == 1234567);   # , 3 digits   -> separator
ok(&parse_decimal("1,234567") == 1.234567);   # , ! 3 digits -> decimal
ok(&parse_decimal("1.234.567") == 1234567);   # . 3 digits   -> separator
ok(&parse_decimal("1.234567") == 1.234567);   # . ! 3 digits -> decimal
ok(&parse_decimal("12,345") == 12345);        # , 3 digits   -> separator
ok(&parse_decimal("12,345,678") == 12345678); # , 3 digits   -> separator
ok(&parse_decimal("12,345.67") == 12345.67);  # , 3 digits   -> separator
ok(&parse_decimal("12,34567") == 12.34567);   # , ! 3 digits -> decimal
ok(&parse_decimal("12.34") == 12.34);         # . ! 3 digits -> decimal
ok(&parse_decimal("12.345") == 12345);        # . 3 digits   -> separator
ok(&parse_decimal("12.345,67") == 12345.67);  # . 3 digits   -> separator
ok(&parse_decimal("12.345.678") == 12345678); # . 3 digits   -> separator
ok(&parse_decimal("12.34567") == 12.34567);   # . ! 3 digits -> decimal
ok(&parse_decimal("123,4567") == 123.4567);   # , ! 3 digits -> decimal
ok(&parse_decimal("123.4567") == 123.4567);   # . ! 3 digits -> decimal
ok(&parse_decimal("1234,567") == 1234.567);   # , 3 digits BUT 4 digits w no separator -> decimal
ok(&parse_decimal("1234.567") == 1234.567);   # . 3 digits BUT 4 digits w no separator -> decimal
ok(&parse_decimal("12345") == 12345);         # No , or .
ok(&parse_decimal("12345,67") == 12345.67);   # , ! 3 digits -> decimal
ok(&parse_decimal("1234567") == 1234567);     # No , or .
29cb106071d163d703484e63839d89cb

draegtun

October 9, 2009, October 09, 2009 21:08, permalink

No rating. Login to rate!

I'm not going to refactor your algorithm because previous two comments cover some ideas/methods on how this maybe achieved.

However I will point out some "style" refactoring which will make your program more DRY.

Key things to note:

1) Test data is held in a hash.
2) No need for & prefix on subs
3) Also no need (at least is this test example) for sub prototypes
4) And rather than if/elsif you can short circuit parse_decimal like in my refactor (though there are quite a few other ways to improve this also).

/I3az/

use strict;
use warnings;
use Test::More;

my %tests = (
    "1,234,567"  => 1234567,    "1,234567"   => 1.234567,
    "1.234.567"  => 1234567,    "1.234567"   => 1.234567,
    "12,345"     => 12345,      "12,345,678" => 12345678,
    "12,345.67"  => 12345.67,   "12,34567"   => 12.34567,
    "12.34"      => 12.34,      "12.345"     => 12345,
    "12.345,67"  => 12345.67,   "12.345.678" => 12345678,
    "12.34567"   => 12.34567,   "123,4567"   => 123.4567,
    "123.4567"   => 123.4567,   "1234,567"   => 1234.567,
    "1234.567"   => 1234.567,   "12345"      => 12345,
    "12345,67"   => 12345.67,   "1234567"    => 1234567,
);

plan tests => scalar keys %tests;
ok parse_decimal( $_ ) == $tests{ $_ } for keys %tests;

sub parse_decimal {
    my $input = shift;
    $input =~ s/[^\d,\.]//g;
    
    my @dot_comma = ( $input, '.', ',' );
    my @comma_dot = ( $input, ',', '.' );
    
    return parse_with_separators( @dot_comma ) if $input !~ /[,\.]/;
    return parse_with_separators( @dot_comma ) if $input =~ /\d,\d+\.\d/;
    return parse_with_separators( @comma_dot ) if $input =~ /\d\.\d+,\d/; 
    return parse_with_separators( @comma_dot ) if $input =~ /\d\.\d+\.\d/;
    return parse_with_separators( @dot_comma ) if $input =~ /\d,\d+,\d/;
    return parse_with_separators( @comma_dot ) if $input =~ /\d{4},\d/;
    return parse_with_separators( @dot_comma ) if $input =~ /\d{4}\.\d/;
    return parse_with_separators( @dot_comma ) if $input =~ /\d,\d{3}$/;
    return parse_with_separators( @comma_dot ) if $input =~ /\d\.\d{3}$/;
    return parse_with_separators( @comma_dot ) if $input =~ /\d,\d/;
    return parse_with_separators( @dot_comma ) if $input =~ /\d\.\d/;
    return parse_with_separators( @dot_comma );
}

sub parse_with_separators {
    my ($input, $decimal_separator, $thousand_separator) = @_;
    my $output = $input;
    $output =~ s/\Q${thousand_separator}\E//g;
    $output =~ s/\Q${decimal_separator}\E/./g;
    return $output;
}
29cb106071d163d703484e63839d89cb

draegtun

October 9, 2009, October 09, 2009 21:13, permalink

No rating. Login to rate!

And here are some other examples of how you could refactor parse_decimal using given/when (need Perl 5.10) with your current algorithm.

Hopefully this will help and also give you some further ideas.

/I3az/

sub parse_decimal {
    my $input = shift;
    $input =~ s/[^\d,\.]//g;
    
    my @dot_comma = ( $input, '.', ',' );
    my @comma_dot = ( $input, ',', '.' );
    
    given ($input) {
        when ( $_ !~ /[,\.]/ )  { return parse_with_separators( @dot_comma ) }
        when ( /\d,\d+\.\d/  )  { return parse_with_separators( @dot_comma ) }
        when ( /\d\.\d+,\d/  )  { return parse_with_separators( @comma_dot ) }
        when ( /\d\.\d+\.\d/ )  { return parse_with_separators( @comma_dot ) }
        when ( /\d,\d+,\d /  )  { return parse_with_separators( @dot_comma ) }
        when ( /\d{4},\d/    )  { return parse_with_separators( @comma_dot ) }
        when ( /\d{4}\.\d/   )  { return parse_with_separators( @dot_comma ) }
        when ( /\d,\d{3}$/   )  { return parse_with_separators( @dot_comma ) }
        when ( /\d\.\d{3}$/  )  { return parse_with_separators( @comma_dot ) }
        when ( /\d,\d/       )  { return parse_with_separators( @comma_dot ) }
        when ( /\d\.\d/      )  { return parse_with_separators( @dot_comma ) }
        default { parse_with_separators( @dot_comma ) }
    }
}
sub parse_decimal {
    my $input = shift;
    $input =~ s/[^\d,\.]//g;
    
    my @dot_comma = ( '.', ',' );
    my @comma_dot = reverse @dot_comma;
    my @use;
    
    given ($input) {
        when ( $_ !~ /[,\.]/ )  { @use = @dot_comma }
        when ( /\d,\d+\.\d/  )  { @use = @dot_comma }
        when ( /\d\.\d+,\d/  )  { @use = @comma_dot }
        when ( /\d\.\d+\.\d/ )  { @use = @comma_dot }
        when ( /\d,\d+,\d /  )  { @use = @dot_comma }
        when ( /\d{4},\d/    )  { @use = @comma_dot }
        when ( /\d{4}\.\d/   )  { @use = @dot_comma }
        when ( /\d,\d{3}$/   )  { @use = @dot_comma }
        when ( /\d\.\d{3}$/  )  { @use = @comma_dot }
        when ( /\d,\d/       )  { @use = @comma_dot }
        when ( /\d\.\d/      )  { @use = @dot_comma }
        default { @use = @dot_comma }
    }
    
    return parse_with_separators( $input, @use );
}
sub parse_decimal {
    return parse_with_separators( sub {
        my $input = shift;
        $input =~ s/[^\d,\.]//g;
        
        my @dot_comma = ( $input, '.', ',' );
        my @comma_dot = ( $input, ',', '.' );
        
        given ($input) {
            when ( $_ !~ /[,\.]/ )  { return @dot_comma }
            when ( /\d,\d+\.\d/  )  { return @dot_comma }
            when ( /\d\.\d+,\d/  )  { return @comma_dot }
            when ( /\d\.\d+\.\d/ )  { return @comma_dot }
            when ( /\d,\d+,\d /  )  { return @dot_comma }
            when ( /\d{4},\d/    )  { return @comma_dot }
            when ( /\d{4}\.\d/   )  { return @dot_comma }
            when ( /\d,\d{3}$/   )  { return @dot_comma }
            when ( /\d\.\d{3}$/  )  { return @comma_dot }
            when ( /\d,\d/       )  { return @comma_dot }
            when ( /\d\.\d/      )  { return @dot_comma }
            default { return @dot_comma }
        }
    }->( $_[0] ));
}
6e8ddfd51613a0bb512abb09b64dafef

johntrammell.myopenid.com

November 16, 2009, November 16, 2009 17:32, permalink

No rating. Login to rate!

Definitely need to get rid of the huge if/else block.

#!/usr/bin/perl

use strict;
use warnings FATAL => 'all';
use Test::More tests => 20;

ok(&parse_decimal("1,234,567") == 1234567);
ok(&parse_decimal("1,234567") == 1.234567);
ok(&parse_decimal("1.234.567") == 1234567);
ok(&parse_decimal("1.234567") == 1.234567);
ok(&parse_decimal("12,345") == 12345);
ok(&parse_decimal("12,345,678") == 12345678);
ok(&parse_decimal("12,345.67") == 12345.67);
ok(&parse_decimal("12,34567") == 12.34567);
ok(&parse_decimal("12.34") == 12.34);
ok(&parse_decimal("12.345") == 12345);
ok(&parse_decimal("12.345,67") == 12345.67);
ok(&parse_decimal("12.345.678") == 12345678);
ok(&parse_decimal("12.34567") == 12.34567);
ok(&parse_decimal("123,4567") == 123.4567);
ok(&parse_decimal("123.4567") == 123.4567);
ok(&parse_decimal("1234,567") == 1234.567);
ok(&parse_decimal("1234.567") == 1234.567);
ok(&parse_decimal("12345") == 12345);
ok(&parse_decimal("12345,67") == 12345.67);
ok(&parse_decimal("1234567") == 1234567);

sub parse_decimal {
    (my $input = shift) =~ s/[^\d,\.]//g;
    my ($pc, $cp) = ([q(.),q(,)],[q(,),q(.)]);
    my @dispatch = (
        [ '\d,\d+\.\d'  => $pc, ],
        [ '\d\.\d+,\d'  => $cp, ],
        [ '\d\.\d+\.\d' => $cp, ],
        [ '\d,\d+,\d'   => $pc, ],
        [ '\d{4},\d'    => $cp, ],
        [ '\d{4}\.\d'   => $pc, ],
        [ '\d,\d{3}$'   => $pc, ],
        [ '\d\.\d{3}$'  => $cp, ],
        [ '\d,\d'       => $cp, ],
        [ '\d.\d'       => $pc, ],
    );
    my @args = @$pc;    # default
    for my $d (@dispatch) {
        next unless $input =~ /$d->[0]/;
        @args = @{ $d->[1] };
        last;
    }
    return parse_with_separators($input, @args);
}

sub parse_with_separators {
    my $input = shift;
    my $decimal_separator = shift;
    my $thousand_separator = shift;
    my $output = $input;
    $output =~ s/\Q${thousand_separator}\E//g;
    $output =~ s/\Q${decimal_separator}\E/./g;
    return $output;
}
34db50b7ce2e115afadf5a765b950739

Thomas Salvador

November 16, 2009, November 16, 2009 20:20, permalink

No rating. Login to rate!

hi.

here an idea on how to simplify this:

cases:
- only ,: if it occurs exactly one time, it is decimal separator, thousands otherwise
- only .: if it occurs exactly one time, it is decimal separator, thousands otherwise
- both , and .:
- the one, that occurs only once is decimal separator, the other one is thousands sep
- if both occur exactly one time each, the rightmost is decimal separator, the other one is thousands sep

...

maybe even simplier:
- the rightmost separator is the decimal separator.
- the second from the right is the thousands separator.
- if both are the same, there is no decimal but there are only thousands separators.

...

hence, imho solution is to determine count and position of the separators.
in the second method it is imho sufficient to get the first two separators from the right.

regards,
thomas.

16ce028e27993156566685b802365873

giftnuss

November 17, 2009, November 17, 2009 10:00, permalink

No rating. Login to rate!

This is a useable implementation of the ideas above.

sub parse_decimal {
    (my $input = shift) =~ s/[^\d,\.]//g;
    my %o = (','=>'.','.'=>',');
    my @sep = $input =~ /[,\.]/g or return $input;

    my ($ds,$ts) = @sep==1
        ? substr($input,-4,1) eq $sep[0] && index($input,$sep[0]) < 4
            ? ($o{$sep[0]},$sep[0]) : ($sep[0],$o{$sep[0]})
        : (($sep[-1] eq $sep[0] ? $o{$sep[0]} : $sep[-1]),$sep[0]);
    parse_with_separators($input,$ds,$ts);
}
C644d3a078aa77de99147d83d8c8de31

Neil Ostrove

January 25, 2010, January 25, 2010 16:28, permalink

No rating. Login to rate!

This gets rid the duplicated ifs. The regex is a bit complicated but I think it's a straightforward description of grouping. The hack to allow null grouping and decimal separators could be removed by splitting parse_grouping into two functions, but I think this is still clear and more DRY/OAOO.

What do you think?

use strict;
use warnings;
use Test::More;

my %tests = (
    "1,234,567"  => 1234567,    "1,234567"   => 1.234567,
    "1.234.567"  => 1234567,    "1.234567"   => 1.234567,
    "12,345"     => 12345,      "12,345,678" => 12345678,
    "12,345.67"  => 12345.67,   "12,34567"   => 12.34567,
    "12.34"      => 12.34,      "12.345"     => 12345,
    "12.345,67"  => 12345.67,   "12.345.678" => 12345678,
    "12.34567"   => 12.34567,   "123,4567"   => 123.4567,
    "123.4567"   => 123.4567,   "1234,567"   => 1234.567,
    "1234.567"   => 1234.567,   "12345"      => 12345,
    "12345,67"   => 12345.67,   "1234567"    => 1234567,
);

plan tests => scalar keys %tests;
is( parse_decimal( $_ ), $tests{ $_ } ) for keys %tests;

sub parse_decimal {
    my $input = shift;
    my $output = parse_grouping($input, ",", "."); # parse as 12,345,67.89
    $output = parse_grouping($output, ".", ",");   # parse as 12.345.67,89
    $output = parse_grouping($output, "", ",");    # parse as 1234567,89

    return $output
}

sub parse_grouping {
    my ($number, $grp, $dec) = @_;
    $grp = "[$grp]" if $grp;     # make regex character class
    $dec = "[$dec]" if $dec;     # make regex character class
    my $parsed = $number;

    if ( $parsed =~ m/^\d{1,3}($grp\d{3})*($dec\d*)?$/ ) {
        $parsed =~ s/$grp//g if $grp;
        $parsed =~ s/$dec/./g if $dec;
    }
    return $parsed
}
D41d8cd98f00b204e9800998ecf8427e

Neil Ostrove

January 25, 2010, January 25, 2010 16:35, permalink

No rating. Login to rate!

Natuarally I messed up the comments (which can't be TDDed). See below.

sub parse_decimal {
    my $input = shift;
    my $output = parse_grouping($input, ",", "."); # parse as 12,345,678.90
    $output = parse_grouping($output, ".", ",");   # parse as 12.345.678,90
    $output = parse_grouping($output, "", ",");    # parse as 1234567,890

    return $output
}

Your refactoring





Format Copy from initial code

or Cancel