#!/usr/bin/perl -wT
use strict;
use warnings;
use Test::More tests => 20;
ok(&parse_decimal("1,234,567") == 1234567);
ok(&parse_decimal("1,234567") == 1.234567);
ok(&parse_decimal("1.234.567") == 1234567);
ok(&parse_decimal("1.234567") == 1.234567);
ok(&parse_decimal("12,345") == 12345);
ok(&parse_decimal("12,345,678") == 12345678);
ok(&parse_decimal("12,345.67") == 12345.67);
ok(&parse_decimal("12,34567") == 12.34567);
ok(&parse_decimal("12.34") == 12.34);
ok(&parse_decimal("12.345") == 12345);
ok(&parse_decimal("12.345,67") == 12345.67);
ok(&parse_decimal("12.345.678") == 12345678);
ok(&parse_decimal("12.34567") == 12.34567);
ok(&parse_decimal("123,4567") == 123.4567);
ok(&parse_decimal("123.4567") == 123.4567);
ok(&parse_decimal("1234,567") == 1234.567);
ok(&parse_decimal("1234.567") == 1234.567);
ok(&parse_decimal("12345") == 12345);
ok(&parse_decimal("12345,67") == 12345.67);
ok(&parse_decimal("1234567") == 1234567);
sub parse_decimal($) {
my $input = shift;
$input =~ s/[^\d,\.]//g;
if ($input !~ /[,\.]/) {
return &parse_with_separators($input, '.', ',');
} elsif ($input =~ /\d,\d+\.\d/) {
return &parse_with_separators($input, '.', ',');
} elsif ($input =~ /\d\.\d+,\d/) {
return &parse_with_separators($input, ',', '.');
} elsif ($input =~ /\d\.\d+\.\d/) {
return &parse_with_separators($input, ',', '.');
} elsif ($input =~ /\d,\d+,\d/) {
return &parse_with_separators($input, '.', ',');
} elsif ($input =~ /\d{4},\d/) {
return &parse_with_separators($input, ',', '.');
} elsif ($input =~ /\d{4}\.\d/) {
return &parse_with_separators($input, '.', ',');
} elsif ($input =~ /\d,\d{3}$/) {
return &parse_with_separators($input, '.', ',');
} elsif ($input =~ /\d\.\d{3}$/) {
return &parse_with_separators($input, ',', '.');
} elsif ($input =~ /\d,\d/) {
return &parse_with_separators($input, ',', '.');
} elsif ($input =~ /\d\.\d/) {
return &parse_with_separators($input, '.', ',');
} else {
return &parse_with_separators($input, '.', ',');
}
}
sub parse_with_separators($$$) {
my $input = shift;
my $decimal_separator = shift;
my $thousand_separator = shift;
my $output = $input;
$output =~ s/\Q${thousand_separator}\E//g;
$output =~ s/\Q${decimal_separator}\E/./g;
return $output;
}
Refactorings
No refactoring yet !
Ruben Wisniewski
September 28, 2009, September 28, 2009 20:00, permalink
I hope I added my patterns right in you're code, sorry if there are bugs, but I can't write perl.
#!/usr/bin/perl -wT
use strict;
use warnings;
use Test::More tests => 20;
ok(&parse_decimal("1,234,567") == 1234567);
ok(&parse_decimal("1,234567") == 1.234567);
ok(&parse_decimal("1.234.567") == 1234567);
ok(&parse_decimal("1.234567") == 1.234567);
ok(&parse_decimal("12,345") == 12345);
ok(&parse_decimal("12,345,678") == 12345678);
ok(&parse_decimal("12,345.67") == 12345.67);
ok(&parse_decimal("12,34567") == 12.34567);
ok(&parse_decimal("12.34") == 12.34);
ok(&parse_decimal("12.345") == 12345);
ok(&parse_decimal("12.345,67") == 12345.67);
ok(&parse_decimal("12.345.678") == 12345678);
ok(&parse_decimal("12.34567") == 12.34567);
ok(&parse_decimal("123,4567") == 123.4567);
ok(&parse_decimal("123.4567") == 123.4567);
ok(&parse_decimal("1234,567") == 1234.567);
ok(&parse_decimal("1234.567") == 1234.567);
ok(&parse_decimal("12345") == 12345);
ok(&parse_decimal("12345,67") == 12345.67);
ok(&parse_decimal("1234567") == 1234567);
sub parse_decimal($) {
my $input = shift;
$input =~ s/[^\d,\.]//g;
if ($input !~ /[,\.]/) {
return &parse_with_separators($input, '.', ',');
} elsif ($input =~ /^[-+]{0,1}(\d{1,3}[.]){0,1}(\d{3}[.]){0,}\d{3}([,]\d{0,}){0,1}$/) {
return &parse_with_separators($input, '.', ',');
} elsif ($input =~ /^[-+]{0,1}\d{0,}[.]\d{0,}$/) {
return &parse_with_separators($input, '.', ',');
} elsif ($input =~ /^[-+]{0,1}\d{0,}[,]\d{0,}$/) {
return &parse_with_separators($input, ',', '.');
} elsif ($input =~ /^[-+]{0,1}(\d{1,3}[,]){0,1}(\d{3}[,]){0,}\d{3}([.]\d{0,}){0,1}$/) {
return &parse_with_separators($input, ',', '.');
} else {
return &parse_with_separators($input, '.', ',');
}
}
sub parse_with_separators($$$) {
my $input = shift;
my $decimal_separator = shift;
my $thousand_separator = shift;
my $output = $input;
$output =~ s/\Q${thousand_separator}\E//g;
$output =~ s/\Q${decimal_separator}\E/./g;
return $output;
}
Ben Marini
October 3, 2009, October 03, 2009 02:56, permalink
Seems like all of these test cases can be satisfied with three cases. I didn't write out the reg exps needed to match these cases but you get the idea...
# Case 1: {comma,period} followed by exactly 3 digits and no occurrence of 4+ digits in a row
# Case 2: {comma,period} followed by exactly N digits where N != 3
# Case 3: No commas or periods
ok(&parse_decimal("1,234,567") == 1234567); # , 3 digits -> separator
ok(&parse_decimal("1,234567") == 1.234567); # , ! 3 digits -> decimal
ok(&parse_decimal("1.234.567") == 1234567); # . 3 digits -> separator
ok(&parse_decimal("1.234567") == 1.234567); # . ! 3 digits -> decimal
ok(&parse_decimal("12,345") == 12345); # , 3 digits -> separator
ok(&parse_decimal("12,345,678") == 12345678); # , 3 digits -> separator
ok(&parse_decimal("12,345.67") == 12345.67); # , 3 digits -> separator
ok(&parse_decimal("12,34567") == 12.34567); # , ! 3 digits -> decimal
ok(&parse_decimal("12.34") == 12.34); # . ! 3 digits -> decimal
ok(&parse_decimal("12.345") == 12345); # . 3 digits -> separator
ok(&parse_decimal("12.345,67") == 12345.67); # . 3 digits -> separator
ok(&parse_decimal("12.345.678") == 12345678); # . 3 digits -> separator
ok(&parse_decimal("12.34567") == 12.34567); # . ! 3 digits -> decimal
ok(&parse_decimal("123,4567") == 123.4567); # , ! 3 digits -> decimal
ok(&parse_decimal("123.4567") == 123.4567); # . ! 3 digits -> decimal
ok(&parse_decimal("1234,567") == 1234.567); # , 3 digits BUT 4 digits w no separator -> decimal
ok(&parse_decimal("1234.567") == 1234.567); # . 3 digits BUT 4 digits w no separator -> decimal
ok(&parse_decimal("12345") == 12345); # No , or .
ok(&parse_decimal("12345,67") == 12345.67); # , ! 3 digits -> decimal
ok(&parse_decimal("1234567") == 1234567); # No , or .
draegtun
October 9, 2009, October 09, 2009 21:08, permalink
I'm not going to refactor your algorithm because previous two comments cover some ideas/methods on how this maybe achieved.
However I will point out some "style" refactoring which will make your program more DRY.
Key things to note:
1) Test data is held in a hash.
2) No need for & prefix on subs
3) Also no need (at least is this test example) for sub prototypes
4) And rather than if/elsif you can short circuit parse_decimal like in my refactor (though there are quite a few other ways to improve this also).
/I3az/
use strict;
use warnings;
use Test::More;
my %tests = (
"1,234,567" => 1234567, "1,234567" => 1.234567,
"1.234.567" => 1234567, "1.234567" => 1.234567,
"12,345" => 12345, "12,345,678" => 12345678,
"12,345.67" => 12345.67, "12,34567" => 12.34567,
"12.34" => 12.34, "12.345" => 12345,
"12.345,67" => 12345.67, "12.345.678" => 12345678,
"12.34567" => 12.34567, "123,4567" => 123.4567,
"123.4567" => 123.4567, "1234,567" => 1234.567,
"1234.567" => 1234.567, "12345" => 12345,
"12345,67" => 12345.67, "1234567" => 1234567,
);
plan tests => scalar keys %tests;
ok parse_decimal( $_ ) == $tests{ $_ } for keys %tests;
sub parse_decimal {
my $input = shift;
$input =~ s/[^\d,\.]//g;
my @dot_comma = ( $input, '.', ',' );
my @comma_dot = ( $input, ',', '.' );
return parse_with_separators( @dot_comma ) if $input !~ /[,\.]/;
return parse_with_separators( @dot_comma ) if $input =~ /\d,\d+\.\d/;
return parse_with_separators( @comma_dot ) if $input =~ /\d\.\d+,\d/;
return parse_with_separators( @comma_dot ) if $input =~ /\d\.\d+\.\d/;
return parse_with_separators( @dot_comma ) if $input =~ /\d,\d+,\d/;
return parse_with_separators( @comma_dot ) if $input =~ /\d{4},\d/;
return parse_with_separators( @dot_comma ) if $input =~ /\d{4}\.\d/;
return parse_with_separators( @dot_comma ) if $input =~ /\d,\d{3}$/;
return parse_with_separators( @comma_dot ) if $input =~ /\d\.\d{3}$/;
return parse_with_separators( @comma_dot ) if $input =~ /\d,\d/;
return parse_with_separators( @dot_comma ) if $input =~ /\d\.\d/;
return parse_with_separators( @dot_comma );
}
sub parse_with_separators {
my ($input, $decimal_separator, $thousand_separator) = @_;
my $output = $input;
$output =~ s/\Q${thousand_separator}\E//g;
$output =~ s/\Q${decimal_separator}\E/./g;
return $output;
}
draegtun
October 9, 2009, October 09, 2009 21:13, permalink
And here are some other examples of how you could refactor parse_decimal using given/when (need Perl 5.10) with your current algorithm.
Hopefully this will help and also give you some further ideas.
/I3az/
sub parse_decimal {
my $input = shift;
$input =~ s/[^\d,\.]//g;
my @dot_comma = ( $input, '.', ',' );
my @comma_dot = ( $input, ',', '.' );
given ($input) {
when ( $_ !~ /[,\.]/ ) { return parse_with_separators( @dot_comma ) }
when ( /\d,\d+\.\d/ ) { return parse_with_separators( @dot_comma ) }
when ( /\d\.\d+,\d/ ) { return parse_with_separators( @comma_dot ) }
when ( /\d\.\d+\.\d/ ) { return parse_with_separators( @comma_dot ) }
when ( /\d,\d+,\d / ) { return parse_with_separators( @dot_comma ) }
when ( /\d{4},\d/ ) { return parse_with_separators( @comma_dot ) }
when ( /\d{4}\.\d/ ) { return parse_with_separators( @dot_comma ) }
when ( /\d,\d{3}$/ ) { return parse_with_separators( @dot_comma ) }
when ( /\d\.\d{3}$/ ) { return parse_with_separators( @comma_dot ) }
when ( /\d,\d/ ) { return parse_with_separators( @comma_dot ) }
when ( /\d\.\d/ ) { return parse_with_separators( @dot_comma ) }
default { parse_with_separators( @dot_comma ) }
}
}
sub parse_decimal {
my $input = shift;
$input =~ s/[^\d,\.]//g;
my @dot_comma = ( '.', ',' );
my @comma_dot = reverse @dot_comma;
my @use;
given ($input) {
when ( $_ !~ /[,\.]/ ) { @use = @dot_comma }
when ( /\d,\d+\.\d/ ) { @use = @dot_comma }
when ( /\d\.\d+,\d/ ) { @use = @comma_dot }
when ( /\d\.\d+\.\d/ ) { @use = @comma_dot }
when ( /\d,\d+,\d / ) { @use = @dot_comma }
when ( /\d{4},\d/ ) { @use = @comma_dot }
when ( /\d{4}\.\d/ ) { @use = @dot_comma }
when ( /\d,\d{3}$/ ) { @use = @dot_comma }
when ( /\d\.\d{3}$/ ) { @use = @comma_dot }
when ( /\d,\d/ ) { @use = @comma_dot }
when ( /\d\.\d/ ) { @use = @dot_comma }
default { @use = @dot_comma }
}
return parse_with_separators( $input, @use );
}
sub parse_decimal {
return parse_with_separators( sub {
my $input = shift;
$input =~ s/[^\d,\.]//g;
my @dot_comma = ( $input, '.', ',' );
my @comma_dot = ( $input, ',', '.' );
given ($input) {
when ( $_ !~ /[,\.]/ ) { return @dot_comma }
when ( /\d,\d+\.\d/ ) { return @dot_comma }
when ( /\d\.\d+,\d/ ) { return @comma_dot }
when ( /\d\.\d+\.\d/ ) { return @comma_dot }
when ( /\d,\d+,\d / ) { return @dot_comma }
when ( /\d{4},\d/ ) { return @comma_dot }
when ( /\d{4}\.\d/ ) { return @dot_comma }
when ( /\d,\d{3}$/ ) { return @dot_comma }
when ( /\d\.\d{3}$/ ) { return @comma_dot }
when ( /\d,\d/ ) { return @comma_dot }
when ( /\d\.\d/ ) { return @dot_comma }
default { return @dot_comma }
}
}->( $_[0] ));
}
johntrammell.myopenid.com
November 16, 2009, November 16, 2009 17:32, permalink
Definitely need to get rid of the huge if/else block.
#!/usr/bin/perl
use strict;
use warnings FATAL => 'all';
use Test::More tests => 20;
ok(&parse_decimal("1,234,567") == 1234567);
ok(&parse_decimal("1,234567") == 1.234567);
ok(&parse_decimal("1.234.567") == 1234567);
ok(&parse_decimal("1.234567") == 1.234567);
ok(&parse_decimal("12,345") == 12345);
ok(&parse_decimal("12,345,678") == 12345678);
ok(&parse_decimal("12,345.67") == 12345.67);
ok(&parse_decimal("12,34567") == 12.34567);
ok(&parse_decimal("12.34") == 12.34);
ok(&parse_decimal("12.345") == 12345);
ok(&parse_decimal("12.345,67") == 12345.67);
ok(&parse_decimal("12.345.678") == 12345678);
ok(&parse_decimal("12.34567") == 12.34567);
ok(&parse_decimal("123,4567") == 123.4567);
ok(&parse_decimal("123.4567") == 123.4567);
ok(&parse_decimal("1234,567") == 1234.567);
ok(&parse_decimal("1234.567") == 1234.567);
ok(&parse_decimal("12345") == 12345);
ok(&parse_decimal("12345,67") == 12345.67);
ok(&parse_decimal("1234567") == 1234567);
sub parse_decimal {
(my $input = shift) =~ s/[^\d,\.]//g;
my ($pc, $cp) = ([q(.),q(,)],[q(,),q(.)]);
my @dispatch = (
[ '\d,\d+\.\d' => $pc, ],
[ '\d\.\d+,\d' => $cp, ],
[ '\d\.\d+\.\d' => $cp, ],
[ '\d,\d+,\d' => $pc, ],
[ '\d{4},\d' => $cp, ],
[ '\d{4}\.\d' => $pc, ],
[ '\d,\d{3}$' => $pc, ],
[ '\d\.\d{3}$' => $cp, ],
[ '\d,\d' => $cp, ],
[ '\d.\d' => $pc, ],
);
my @args = @$pc; # default
for my $d (@dispatch) {
next unless $input =~ /$d->[0]/;
@args = @{ $d->[1] };
last;
}
return parse_with_separators($input, @args);
}
sub parse_with_separators {
my $input = shift;
my $decimal_separator = shift;
my $thousand_separator = shift;
my $output = $input;
$output =~ s/\Q${thousand_separator}\E//g;
$output =~ s/\Q${decimal_separator}\E/./g;
return $output;
}
Thomas Salvador
November 16, 2009, November 16, 2009 20:20, permalink
hi.
here an idea on how to simplify this:
cases:
- only ,: if it occurs exactly one time, it is decimal separator, thousands otherwise
- only .: if it occurs exactly one time, it is decimal separator, thousands otherwise
- both , and .:
- the one, that occurs only once is decimal separator, the other one is thousands sep
- if both occur exactly one time each, the rightmost is decimal separator, the other one is thousands sep
...
maybe even simplier:
- the rightmost separator is the decimal separator.
- the second from the right is the thousands separator.
- if both are the same, there is no decimal but there are only thousands separators.
...
hence, imho solution is to determine count and position of the separators.
in the second method it is imho sufficient to get the first two separators from the right.
regards,
thomas.
giftnuss
November 17, 2009, November 17, 2009 10:00, permalink
This is a useable implementation of the ideas above.
sub parse_decimal {
(my $input = shift) =~ s/[^\d,\.]//g;
my %o = (','=>'.','.'=>',');
my @sep = $input =~ /[,\.]/g or return $input;
my ($ds,$ts) = @sep==1
? substr($input,-4,1) eq $sep[0] && index($input,$sep[0]) < 4
? ($o{$sep[0]},$sep[0]) : ($sep[0],$o{$sep[0]})
: (($sep[-1] eq $sep[0] ? $o{$sep[0]} : $sep[-1]),$sep[0]);
parse_with_separators($input,$ds,$ts);
}
Neil Ostrove
January 25, 2010, January 25, 2010 16:28, permalink
This gets rid the duplicated ifs. The regex is a bit complicated but I think it's a straightforward description of grouping. The hack to allow null grouping and decimal separators could be removed by splitting parse_grouping into two functions, but I think this is still clear and more DRY/OAOO.
What do you think?
use strict;
use warnings;
use Test::More;
my %tests = (
"1,234,567" => 1234567, "1,234567" => 1.234567,
"1.234.567" => 1234567, "1.234567" => 1.234567,
"12,345" => 12345, "12,345,678" => 12345678,
"12,345.67" => 12345.67, "12,34567" => 12.34567,
"12.34" => 12.34, "12.345" => 12345,
"12.345,67" => 12345.67, "12.345.678" => 12345678,
"12.34567" => 12.34567, "123,4567" => 123.4567,
"123.4567" => 123.4567, "1234,567" => 1234.567,
"1234.567" => 1234.567, "12345" => 12345,
"12345,67" => 12345.67, "1234567" => 1234567,
);
plan tests => scalar keys %tests;
is( parse_decimal( $_ ), $tests{ $_ } ) for keys %tests;
sub parse_decimal {
my $input = shift;
my $output = parse_grouping($input, ",", "."); # parse as 12,345,67.89
$output = parse_grouping($output, ".", ","); # parse as 12.345.67,89
$output = parse_grouping($output, "", ","); # parse as 1234567,89
return $output
}
sub parse_grouping {
my ($number, $grp, $dec) = @_;
$grp = "[$grp]" if $grp; # make regex character class
$dec = "[$dec]" if $dec; # make regex character class
my $parsed = $number;
if ( $parsed =~ m/^\d{1,3}($grp\d{3})*($dec\d*)?$/ ) {
$parsed =~ s/$grp//g if $grp;
$parsed =~ s/$dec/./g if $dec;
}
return $parsed
}
Neil Ostrove
January 25, 2010, January 25, 2010 16:35, permalink
Natuarally I messed up the comments (which can't be TDDed). See below.
sub parse_decimal {
my $input = shift;
my $output = parse_grouping($input, ",", "."); # parse as 12,345,678.90
$output = parse_grouping($output, ".", ","); # parse as 12.345.678,90
$output = parse_grouping($output, "", ","); # parse as 1234567,890
return $output
}
I'm trying to create a method that provides "best effort" parsing of decimal inputs in cases where I do not know which of these two mutually exclusive ways of writing numbers the end-user is using:
* "." as thousands separator and "," as decimal separator
* "," as thousands separator and "." as decimal separator
The method is implemented as parse_decimal(..) in the code below. Furthermore, I've defined 20 test cases that show how the heuristics of the method should work.
While the code below passes the tests it is quite horrible and unreadable. I'm sure there is a more compact and readable way to implement the method. Possibly including smarter use of regexpes.