package Lingua::TH::Wordbreak;

=head1 TITLE

Lingua::TH::Wordbreak - dictionary based word breaking in Thai

=head1 SYNOPSIS

    use Lingua::TH::Wordbreak;
    
    while (<>)
    { print wordbreak($_); }
    
=head1 DESCRIPTION

Thai word breaking is a fun activity to which everyone is welcome to join in!
Since Thai doesn't have word breaks, it isn't obvious how to break a string
of Thai letters into words. There are many different approaches that have been
tried over the years.

This module uses the currently accepted wisdom that the easiest and most
effective algorithm is to find all the different possible ways of splitting
the phrase into words and then choose the one which results in the least
number of word breaks.

There are two interesting features of this module:

=over 4

=item .

Any word list from any language may be used. There are additional functions for
reading such word lists from files containing one word per line.

=item .

The algorithm works in the presence of unknown words, which can be collected.
Such an unknown list often includes punctuation, numbers, etc. as well as
words not in the dictionary. Obviously the algorithm will not be able to break
within these unknown words.

=back 4

We do this using a suffix tree.

The main function: wordbreak, takes up to five parameters:

    wordbreak($str[, $break_char[, $dict[, $unk[, $trees]]]]);

=over 4

=item str

Contains the string to break.

=item break_char

Character to insert as the word break character, defaults to U+200B

=item dict

A reference to a hash whose keys constitute the dictionary to use for breaking the text.
By default the Thai Royal Institute Dictionary word list will be used.

=item unk

A reference to a hash whose keys will contain all the unknown words the breaking algorithm
identifies. Can be built up progressively over a whole document.

=item trees

The suffix tree to use and update (or create)

=back 4

=cut

=head1 Special functions

use utf8;
use strict;

use File::Spec qw(catfile);
use IO::File;
use Compress::Zlib;
require Exporter;
use vars qw(@ISA %dict);

@ISA = qw(Exporter);

@EXPORT = qw(wordbreak);
@EXPORT_OK = qw(wordbreak add_words %dict);

=head2 add_words(@words)

Adds a list of words to the default dictionary.

=cut

sub add_words
{
    my (@words) = @_;
    
    load_thai unless (defined %dict);
    foreach (@words)
    { $dict{$_}++; }
}

sub wordbreak
{
    my ($str, $break_char, $dict, $unk, $trees) = @_;
    my ($i, $tree, $c, $res, $acc, $tacc, $accrest);

    unless ($dict)
    {
        load_thai() unless (scalar %dict);
        $dict = \%dict;
    }

    use utf8;

    $break_char ||= "\x{200B}";
    $str =~ m/(.)$/o;
    $c = $1;
    if ($dict->{$c})
    { $trees->{$c} = [[$c], []]; }
    else
    { $trees->{$c} = [[$c], [$c], $c]; }
    
    $acc = $str;
    for (($acc, $tacc) = $acc =~ m/(.*)(..)$/o; $tacc; ($acc, $tacc) = $acc =~ m/(.*)(.)$/o)
    {
        $accrest = "$tacc$accrest";
        make_suffix($accrest, $dict, $trees); 
    }
    
    $tree = $trees->{$str};
    foreach $c (@{$tree->[1]})
    { $unk->{$c}++; }
        
    $res = join($break_char, @{$tree->[0]});
    return $res;
}

sub make_suffix
{
    my ($str, $dict, $trees) = @_;
    my ($i, $suff, $unk, @tries);
    my ($first, $rest, $acc, $accrest, $tacc);
    
    use utf8;
    
    $str =~ m/^(.)(.*)/o;
    $first = $1;
    $rest = $2;
    
    $accrest = $str;
    for (($tacc, $accrest) = $accrest =~ m/^(.)(.*)$/o; $tacc; ($tacc, $accrest) = $accrest =~ m/^(.)(.*)$/o)
    {
        $acc .= $tacc;
        $suff = $trees->{$accrest};
        $mysuff = [[@{$suff->[0]}], [@{$suff->[1]}], $suff->[2]];
        if ($dict->{$acc})
        {
            unshift(@{$mysuff->[0]}, $acc);
            $mysuff->[2] = '';
        }
        elsif ($mysuff->[2] ne '')
        {
            $mysuff->[2] = $acc . $mysuff->[2];
            $mysuff->[0][0] = $acc . $mysuff->[0][0];
            $mysuff->[1][0] = $acc . $mysuff->[1][0];
        }
        else
        {
            $mysuff->[2] = $acc;
            unshift (@{$mysuff->[1]}, $acc);
            unshift (@{$mysuff->[0]}, $acc);
        }
        push (@tries, $mysuff);
    }

if(0){    
    unless (scalar @tries)
    {
        $suff = $trees->{$rest};
        $unk = $suff->[2];
        $mysuff = [[@{$suff->[0]}], [@{$suff->[1]}], $suff->[2]];
        if ($unk eq '')
        {
            $unk = $first;
            unshift(@{$mysuff->[0]}, $unk);
            unshift(@{$mysuff->[1]}, $unk);
        }
        else
        {
            my ($test) = $first . $unk;
            my ($found) = 0;
            
            $unk = '';
            while ($test)
            {
                if ($dict->{$test})
                {
                    splice(@{$mysuff->[0]}, 0, 1, $test, $unk);
                    $mysuff->[1][0] = $unk;
                    $found = 1;
                    last;
                }
                ($test, $tunk) = $test =~ m/(.*)(.)$/o;
                $unk = $tunk . $unk;
            }
            unless ($found)
            {
                $mysuff->[0][0] = $unk;
                $mysuff->[1][0] = $unk;
            }
        }
        $mysuff->[2] = $unk;
        push (@tries, $mysuff);
    }
}
    
    @tries = sort {
            length(join('', @{$a->[1]})) <=> length(join('', @{$b->[1]})) ||
            scalar @{$a->[1]} <=> scalar @{$b->[1]} ||
            scalar @{$a->[0]} <=> scalar @{$b->[0]}} @tries;
    $trees->{$str} = $tries[0];
}

sub load_thai
{
    my ($s, $riwords);
    $riwords = require Lingua::TH::Riwords;
    use utf8;
    foreach $s (split(/\n/, $riwords))
    { $dict{$s} = 1 unless ($s =~ m/^.$/o); }
}

1;