package Lingua::TH::Wordbreak; =head1 TITLE Lingua::TH::Wordbreak - dictionary based word breaking in Thai =head1 SYNOPSIS use Lingua::TH::Wordbreak; while (<>) { print wordbreak($_); } =head1 DESCRIPTION Thai word breaking is a fun activity to which everyone is welcome to join in! Since Thai doesn't have word breaks, it isn't obvious how to break a string of Thai letters into words. There are many different approaches that have been tried over the years. This module uses the currently accepted wisdom that the easiest and most effective algorithm is to find all the different possible ways of splitting the phrase into words and then choose the one which results in the least number of word breaks. There are two interesting features of this module: =over 4 =item . Any word list from any language may be used. There are additional functions for reading such word lists from files containing one word per line. =item . The algorithm works in the presence of unknown words, which can be collected. Such an unknown list often includes punctuation, numbers, etc. as well as words not in the dictionary. Obviously the algorithm will not be able to break within these unknown words. =back 4 We do this using a suffix tree. The main function: wordbreak, takes up to five parameters: wordbreak($str[, $break_char[, $dict[, $unk[, $trees]]]]); =over 4 =item str Contains the string to break. =item break_char Character to insert as the word break character, defaults to U+200B =item dict A reference to a hash whose keys constitute the dictionary to use for breaking the text. By default the Thai Royal Institute Dictionary word list will be used. =item unk A reference to a hash whose keys will contain all the unknown words the breaking algorithm identifies. Can be built up progressively over a whole document. =item trees The suffix tree to use and update (or create) =back 4 =cut =head1 Special functions use utf8; use strict; use File::Spec qw(catfile); use IO::File; use Compress::Zlib; require Exporter; use vars qw(@ISA %dict); @ISA = qw(Exporter); @EXPORT = qw(wordbreak); @EXPORT_OK = qw(wordbreak add_words %dict); =head2 add_words(@words) Adds a list of words to the default dictionary. =cut sub add_words { my (@words) = @_; load_thai unless (defined %dict); foreach (@words) { $dict{$_}++; } } sub wordbreak { my ($str, $break_char, $dict, $unk, $trees) = @_; my ($i, $tree, $c, $res, $acc, $tacc, $accrest); unless ($dict) { load_thai() unless (scalar %dict); $dict = \%dict; } use utf8; $break_char ||= "\x{200B}"; $str =~ m/(.)$/o; $c = $1; if ($dict->{$c}) { $trees->{$c} = [[$c], []]; } else { $trees->{$c} = [[$c], [$c], $c]; } $acc = $str; for (($acc, $tacc) = $acc =~ m/(.*)(..)$/o; $tacc; ($acc, $tacc) = $acc =~ m/(.*)(.)$/o) { $accrest = "$tacc$accrest"; make_suffix($accrest, $dict, $trees); } $tree = $trees->{$str}; foreach $c (@{$tree->[1]}) { $unk->{$c}++; } $res = join($break_char, @{$tree->[0]}); return $res; } sub make_suffix { my ($str, $dict, $trees) = @_; my ($i, $suff, $unk, @tries); my ($first, $rest, $acc, $accrest, $tacc); use utf8; $str =~ m/^(.)(.*)/o; $first = $1; $rest = $2; $accrest = $str; for (($tacc, $accrest) = $accrest =~ m/^(.)(.*)$/o; $tacc; ($tacc, $accrest) = $accrest =~ m/^(.)(.*)$/o) { $acc .= $tacc; $suff = $trees->{$accrest}; $mysuff = [[@{$suff->[0]}], [@{$suff->[1]}], $suff->[2]]; if ($dict->{$acc}) { unshift(@{$mysuff->[0]}, $acc); $mysuff->[2] = ''; } elsif ($mysuff->[2] ne '') { $mysuff->[2] = $acc . $mysuff->[2]; $mysuff->[0][0] = $acc . $mysuff->[0][0]; $mysuff->[1][0] = $acc . $mysuff->[1][0]; } else { $mysuff->[2] = $acc; unshift (@{$mysuff->[1]}, $acc); unshift (@{$mysuff->[0]}, $acc); } push (@tries, $mysuff); } if(0){ unless (scalar @tries) { $suff = $trees->{$rest}; $unk = $suff->[2]; $mysuff = [[@{$suff->[0]}], [@{$suff->[1]}], $suff->[2]]; if ($unk eq '') { $unk = $first; unshift(@{$mysuff->[0]}, $unk); unshift(@{$mysuff->[1]}, $unk); } else { my ($test) = $first . $unk; my ($found) = 0; $unk = ''; while ($test) { if ($dict->{$test}) { splice(@{$mysuff->[0]}, 0, 1, $test, $unk); $mysuff->[1][0] = $unk; $found = 1; last; } ($test, $tunk) = $test =~ m/(.*)(.)$/o; $unk = $tunk . $unk; } unless ($found) { $mysuff->[0][0] = $unk; $mysuff->[1][0] = $unk; } } $mysuff->[2] = $unk; push (@tries, $mysuff); } } @tries = sort { length(join('', @{$a->[1]})) <=> length(join('', @{$b->[1]})) || scalar @{$a->[1]} <=> scalar @{$b->[1]} || scalar @{$a->[0]} <=> scalar @{$b->[0]}} @tries; $trees->{$str} = $tries[0]; } sub load_thai { my ($s, $riwords); $riwords = require Lingua::TH::Riwords; use utf8; foreach $s (split(/\n/, $riwords)) { $dict{$s} = 1 unless ($s =~ m/^.$/o); } } 1;