# thaisplit - insert word-break marks into text using Lingua::TH::Wordbreak.
#
# Reads infile line by line, decodes each line with the mapping chosen by -m,
# passes every whitespace-separated chunk to wordbreak() and writes the
# result to outfile as UTF-8.
#
# Options handled below that the usage text does not mention:
#   -p num   pass one shared hash to wordbreak() for all lines (presumably a
#            cache of parse trees - confirm against the library); after each
#            line, entries whose key is longer than num * 3 characters are
#            dropped
#   -t       ignore infile/outfile and break a built-in sample string
#   -z bits  diagnostics on STDERR: 1 = size of the shared hash, 2 = timings
#   -d arg   accepted by getopts but not used anywhere in this script
use strict;
use warnings;

use Lingua::TH::Wordbreak;
use Getopt::Std;
use Encode::Registry;
use IO::File;
use Benchmark;

# getopts() stores each option in the package variable $opt_<letter>, so
# these have to be package variables rather than lexicals.
our ($opt_a, $opt_c, $opt_d, $opt_m, $opt_p, $opt_s, $opt_t, $opt_u, $opt_z);

# NOTE(review): nothing in this script ever fills %unk, so -u only produces
# a file if other code populates %main::unk - confirm against the library.
our %unk;

getopts('a:c:d:m:p:stu:z:');

unless ($opt_t || defined $ARGV[1]) {
    die <<'EOT';
    thaisplit [-a file] [-c hex] [-m mapping] [-s] [-u file] infile outfile
Word breaks a text according to the dictionary given.

    -a file     Additional dictionary entries
    -c hex      byte to use for wordbreak [/]
    -m mapping  Mapping identifier to use
    -s          skip SFMs
    -u file     List of unknown words encountered
EOT
}

# -z is a bit mask; treat "not given" as "no bits set".
$opt_z ||= 0;

my $time_start;
$time_start = Benchmark->new if ($opt_z & 2);

$opt_m ||= 'thai-upc';
my $enc = find_encoding($opt_m) || die "Can't find $opt_m encoding";

# The break mark is given as a hex byte value in the source encoding and is
# decoded so that it can be inserted into decoded text.
if ($opt_c) {
    $opt_c = $enc->decode(pack('C', hex($opt_c)));
}
else {
    $opt_c = '/';
}

# Extra dictionary entries: one per line, in the same encoding as the text.
if ($opt_a) {
    my @words;
    my $dict = open_file($opt_a, '<')
        || die "Can't open $opt_a for reading: $!";
    while (my $entry = <$dict>) {
        chomp $entry;
        next if $entry eq '';       # skip blank lines only (not a literal "0")
        push @words, $enc->decode($entry);
    }
    $dict->close();
    Lingua::TH::Wordbreak::add_words(@words);
}

if ($opt_t) {
    # Built-in smoke test: break a fixed sample string and print the result.
    my $test = 'อานุุภาพของพระวิญญาณบริสุทธิ์แก‹เหล‹าอัครทูต';
    binmode(STDOUT, ':utf8');       # the result is a decoded character string
    print Lingua::TH::Wordbreak::wordbreak($enc->decode($test), $opt_c);
}
else {
    my $fhin  = open_file($ARGV[0], '<') || die "Can't open $ARGV[0]: $!";
    my $fhout = open_file($ARGV[1], '>') || die "Can't write to $ARGV[1]: $!";

    # $res below holds decoded characters.  Without an explicit layer Perl
    # would emit UTF-8 for lines containing code points above 255 and Latin-1
    # for all other lines, giving a file with mixed encodings.
    binmode($fhout, ':utf8');

    my $trees = $opt_p ? {} : undef;
    my $count = 0;

    while (my $line = <$fhin>) {
        print STDERR "." if (++$count % 100 == 0);     # progress indicator
        my $res = '';
        chomp $line;
        $line = $enc->decode($line);

        if ($opt_s) {
            # Copy a leading backslash marker through unbroken; for \v the
            # following verse number/range is kept with the marker.
            if ($line =~ s/^\\v\s+([0-9-]+)\s*//) {
                $res = "\\v $1 ";
            }
            elsif ($line =~ s/^\\(\S+)\s*//) {
                $res = "\\$1 ";
            }
        }

        # NOTE(review): a line that still starts with whitespace yields an
        # empty first chunk here, which is passed to wordbreak() as well.
        foreach my $chunk (split(/\s+/, $line)) {
            $res .= Lingua::TH::Wordbreak::wordbreak($chunk, $opt_c, undef, undef, $trees)
                  . ($opt_c ? "$opt_c " : " ");
        }
        $res =~ s/.$//;             # drop the final separator character

#        $fhout->print($enc->encode($res) . "\n");
        $fhout->print("$res\n");

        if ($opt_p) {
            # Keep the shared hash from growing without bound: drop every
            # entry whose key is longer than $opt_p * 3 characters.
            foreach my $key (keys %{$trees}) {
                delete $trees->{$key} if (length($key) > $opt_p * 3);
            }
        }
    }

    # Count the entries explicitly: a hash in scalar context reports bucket
    # usage (e.g. "3/8") rather than a count on perls older than 5.26.
    print STDERR "Trees are: " . scalar(keys %{$trees}) . "\n"
        if ($opt_p && ($opt_z & 1));

    $fhin->close();
    # Buffered write errors only show up when the handle is closed.
    $fhout->close() || die "Can't write to $ARGV[1]: $!";
}

if ($opt_u && scalar keys %unk) {
    my $fh = open_file($opt_u, '>') || die "Can't write $opt_u: $!";
    foreach my $word (sort keys %unk) {
        $fh->print($enc->encode($word) . "\n");
    }
    $fh->close() || die "Can't write $opt_u: $!";
}

if ($opt_z & 2) {
    my $time_end = Benchmark->new;
    print STDERR "Timings: " . timestr(timediff($time_end, $time_start)) . "\n";
}

# Opens $path with the Perl mode string $mode ('<' or '>') and returns the
# IO::File handle, or undef on failure with $! set.  The mode is passed
# separately so that whitespace and special characters in the name are taken
# literally; a name of '-' keeps its traditional meaning of STDIN/STDOUT,
# which only the single-string form of open provides.
sub open_file {
    my ($path, $mode) = @_;
    return IO::File->new("$mode -") if $path eq '-';
    return IO::File->new($path, $mode);
}