haxed up Butts.pm to use Moose, and filled in a bit more of the POD docs. Nothing /should/ break.
This commit is contained in:
parent
d237998a9d
commit
faf81ac08b
660
Butts.pm
660
Butts.pm
|
@ -27,363 +27,443 @@ Yes.
|
|||
|
||||
=cut
|
||||
|
||||
package Butts;
|
||||
=head1 CONSTRUCTOR
|
||||
|
||||
=over
|
||||
|
||||
=item C<new>
|
||||
|
||||
Takes a number of optional arguments:
|
||||
'debug', sets module debugging output on or off.
|
||||
'hyphen_file', specify a different hyphen file for L<TeX::Hyphen>,
|
||||
defaults to C<$module_dir/hyphen.tex>.
|
||||
'stopwords_file', specify a different source of stopwords,
|
||||
defaults to C<$module_dir/stopwords>.
|
||||
|
||||
=back
|
||||
|
||||
=cut
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
use Math::Random;
|
||||
use TeX::Hyphen;
|
||||
use Data::Dumper;
|
||||
use FindBin qw($RealBin);
|
||||
use Carp;
|
||||
{
|
||||
|
||||
package Butts;
|
||||
|
||||
use Moose;
|
||||
|
||||
use Math::Random;
|
||||
use TeX::Hyphen;
|
||||
use Data::Dumper;
|
||||
use FindBin qw($RealBin);
|
||||
use Carp;
|
||||
|
||||
|
||||
has 'replace_freq' =>
|
||||
(
|
||||
isa => 'Num',
|
||||
is => 'rw',
|
||||
default => sub { 1/11 }
|
||||
);
|
||||
|
||||
|
||||
has 'meme' =>
|
||||
(
|
||||
isa => 'Str',
|
||||
is => 'rw',
|
||||
default => sub { 'butt' },
|
||||
required => 1,
|
||||
);
|
||||
|
||||
has 'hyphen_file' =>
|
||||
(
|
||||
isa => 'Str',
|
||||
is => 'ro',
|
||||
default => sub { $RealBin . '/hyphen.tex' },
|
||||
);
|
||||
|
||||
has 'stopwords_file' =>
|
||||
(
|
||||
isa => 'Str',
|
||||
is => 'ro',
|
||||
default => sub { $RealBin . '/stopwords' },
|
||||
);
|
||||
|
||||
has 'debug' =>
|
||||
(
|
||||
isa => 'Bool',
|
||||
is => 'rw',
|
||||
required => 1,
|
||||
default => sub { 0 },
|
||||
);
|
||||
|
||||
has 'hyphenator' =>
|
||||
(
|
||||
isa => 'TeX::Hyphen',
|
||||
is => 'ro',
|
||||
lazy => 1,
|
||||
builder => '_build_hyphenator',
|
||||
);
|
||||
|
||||
has 'stopwords' =>
|
||||
(
|
||||
isa => 'HashRef[Str]',
|
||||
is => 'ro',
|
||||
lazy => 1,
|
||||
builder => '_build_stopwords',
|
||||
);
|
||||
|
||||
has 'words' =>
|
||||
(
|
||||
isa => 'ArrayRef[Str]',
|
||||
is => 'ro',
|
||||
writer => '_set_words',
|
||||
default => sub { [] },
|
||||
);
|
||||
|
||||
has 'word_indices' =>
|
||||
(
|
||||
isa => 'ArrayRef[Int]',
|
||||
is => 'ro',
|
||||
default => sub { [] },
|
||||
writer => '_set_word_indices',
|
||||
);
|
||||
|
||||
use fields qw/replace_freq
|
||||
meme
|
||||
hyphen_file
|
||||
stopwords_file
|
||||
hyphenator
|
||||
stopwords
|
||||
debug
|
||||
_words
|
||||
_word_indices/;
|
||||
|
||||
=head1 METHODS
|
||||
|
||||
=cut
|
||||
|
||||
sub new {
|
||||
my Butts $self = shift;
|
||||
unless (ref $self) {
|
||||
$self = fields::new($self);
|
||||
sub _build_hyphenator {
|
||||
my $self = shift;
|
||||
return TeX::Hyphen->new(file => $self->hyphen_file);
|
||||
}
|
||||
|
||||
sub _build_stopwords {
|
||||
my $self = shift;
|
||||
my @stopwords;
|
||||
if (open my $sfh, $self->stopwords_file) {
|
||||
chomp(@stopwords = <$sfh>);
|
||||
close $sfh;
|
||||
} else {
|
||||
carp "Couldn't read stopwords file "
|
||||
. $self->stopwords_file . ' ' . $!;
|
||||
@stopwords = qw/a an and or but it in its It's it's the of you I i/;
|
||||
}
|
||||
|
||||
my %args = (hyphen_file => "$RealBin/hyphen.tex",
|
||||
stopwords_file => "$RealBin/stopwords",
|
||||
debug => 0,
|
||||
meme => 'butt',
|
||||
replace_freq => (1/11), # original value from tef.
|
||||
@_);
|
||||
|
||||
foreach my $key (keys %args) {
|
||||
$self->{$key} = $args{$key};
|
||||
return { map { lc($_) => 1 } @stopwords };
|
||||
}
|
||||
|
||||
$self->{hyphenator} = new TeX::Hyphen((file => $self->{hyphen_file}))
|
||||
or croak "Couldn't create TeX::Hyphen instance from " . $self->{hyphen_file};
|
||||
|
||||
my @stopwords;
|
||||
if (open my $sfh, $self->{stopwords_file}) {
|
||||
chomp(@stopwords = <$sfh>);
|
||||
close $sfh;
|
||||
} else {
|
||||
carp "Couldn't read stopwords file "
|
||||
. $self->{stopwords_file} . ' ' . $!;
|
||||
@stopwords = qw/a an and or but it in its It's it's the of you I i/;
|
||||
}
|
||||
|
||||
$self->{stopwords} = { map { lc($_) => 1 } @stopwords };
|
||||
|
||||
return $self;
|
||||
}
|
||||
|
||||
=head2 meme($value)
|
||||
|
||||
Method which sets / returns the current replacement meme. If called without
|
||||
additional arguments, it returns the current meme. Calling it with a scalar
|
||||
replaces the old meme with a new one.
|
||||
|
||||
=cut
|
||||
|
||||
# accessors
|
||||
sub meme {
|
||||
my $self = shift;
|
||||
if (@_) {
|
||||
$self->{meme} = $_[0];
|
||||
}
|
||||
return $self->{meme}
|
||||
}
|
||||
|
||||
=head2 replace_freq($value)
|
||||
|
||||
Getter/Setter Method for the replacement frequency. Value should be passed as a
|
||||
fractional value, which corresponds to the number of words considered for meme
|
||||
replacement via the following calculation:
|
||||
|
||||
=head2 debug($value)
|
||||
|
||||
Turn debugging output on (C<1>) or off (C<0>). Debugging output is printed to
|
||||
C<STDERR>.
|
||||
|
||||
=cut
|
||||
|
||||
sub replace_freq {
|
||||
my $self = shift;
|
||||
if (@_) {
|
||||
$self->{replace_freq} = $_[0];
|
||||
# helpers
|
||||
sub is_stop_word {
|
||||
my ($self, $word) = @_;
|
||||
return exists $self->stopwords->{lc($word)};
|
||||
}
|
||||
return $self->{replace_freq}
|
||||
}
|
||||
|
||||
# helpers
|
||||
sub is_stop_word {
|
||||
my ($self, $word) = @_;
|
||||
return exists $self->{stopwords}->{lc($word)};
|
||||
}
|
||||
sub is_meme {
|
||||
my ($self, $word) = @_;
|
||||
return lc($word) eq lc($self->meme);
|
||||
|
||||
sub is_meme {
|
||||
my ($self, $word) = @_;
|
||||
return lc($word) eq lc($self->{meme});
|
||||
}
|
||||
|
||||
}
|
||||
sub _split_preserving_whitespace {
|
||||
my ($self, $string) = @_;
|
||||
|
||||
sub split_preserving_whitespace {
|
||||
my ($string) = @_;
|
||||
my ($leading_ws, $remainder) = ($string =~ m/^(\s*)(.*)$/s);
|
||||
$leading_ws //= '';
|
||||
|
||||
my ($leading_ws, $remainder) = ($string =~ m/^(\s*)(.*)$/s);
|
||||
$leading_ws //= '';
|
||||
|
||||
my @all_split = split(/(\s+)/, $remainder);
|
||||
my (@words, @ws);
|
||||
foreach my $tok (@all_split) {
|
||||
if ($tok =~ m/^\s+$/) {
|
||||
push @ws, $tok
|
||||
} else {
|
||||
push @words, $tok
|
||||
my @all_split = split(/(\s+)/, $remainder);
|
||||
my (@words, @ws);
|
||||
foreach my $tok (@all_split) {
|
||||
if ($tok =~ m/^\s+$/) {
|
||||
push @ws, $tok
|
||||
} else {
|
||||
push @words, $tok
|
||||
}
|
||||
}
|
||||
return ($leading_ws, \@words, \@ws);
|
||||
}
|
||||
return ($leading_ws, \@words, \@ws);
|
||||
}
|
||||
|
||||
|
||||
sub reassemble_with_whitespace {
|
||||
my ($leading, $words, $ws) = @_;
|
||||
sub _reassemble_with_whitespace {
|
||||
my ($self, $leading, $words, $ws) = @_;
|
||||
|
||||
# interleave the two arrays. Words always come first, because
|
||||
# any leading space is in $leading.
|
||||
# http://www.perlmonks.org/?node_id=53605
|
||||
# interleave the two arrays. Words always come first, because
|
||||
# any leading space is in $leading.
|
||||
# http://www.perlmonks.org/?node_id=53605
|
||||
|
||||
# if things are different sizes we'll end up with some undefs,
|
||||
# so grep them out.
|
||||
my @ret = grep { defined } map { $words->[$_], $ws->[$_] }
|
||||
0 .. ($#$words > $#$ws ? $#$words : $#$ws);
|
||||
# and convert back to a string.
|
||||
return $leading . join('', @ret);
|
||||
}
|
||||
# if things are different sizes we'll end up with some undefs,
|
||||
# so grep them out.
|
||||
my @ret = grep { defined } map { $words->[$_], $ws->[$_] }
|
||||
0 .. ($#$words > $#$ws ? $#$words : $#$ws);
|
||||
# and convert back to a string.
|
||||
return $leading . join('', @ret);
|
||||
}
|
||||
|
||||
sub buttify_string($_) {
|
||||
my $self = shift;
|
||||
# glom a string from $_ if we didn't get one passed.
|
||||
my $str = (@_ ? $_[0] : $_);
|
||||
chomp($str);
|
||||
=head2 buttify_string
|
||||
|
||||
# FIX for http://code.google.com/p/buttbot/issues/detail?id=7
|
||||
my ($leading, $words, $whitespace)
|
||||
= split_preserving_whitespace($str);
|
||||
This method is the core of Butts.pm. It takes a string argument (or defaults
|
||||
to C<$_> if none is given, and returns a string in which random parts of words
|
||||
have been replaced with the contents of C<$self-E<gt>meme>.
|
||||
|
||||
my @butted_words = $self->buttify(@$words);
|
||||
The original whitespace of the string is preserved as far as possible.
|
||||
|
||||
return reassemble_with_whitespace($leading, \@butted_words, $whitespace);
|
||||
}
|
||||
=cut
|
||||
|
||||
sub buttify {
|
||||
my ($self, @words) = @_;
|
||||
my $how_many_butts = int(@words * $self->{replace_freq}) + 1;
|
||||
my $debug = $self->{debug};
|
||||
sub buttify_string($_) {
|
||||
my $self = shift;
|
||||
# glom a string from $_ if we didn't get one passed.
|
||||
my $str = (@_ ? $_[0] : $_);
|
||||
chomp($str);
|
||||
|
||||
$self->{_words} = \@words;
|
||||
# sort indices by word length
|
||||
my @word_idxs_len_sorted = do {
|
||||
my $c;
|
||||
# FIX for http://code.google.com/p/buttbot/issues/detail?id=7
|
||||
my ($leading, $words, $whitespace)
|
||||
= $self->_split_preserving_whitespace($str);
|
||||
|
||||
map { $_->[0] }
|
||||
sort { $b->[1] <=> $a->[1] }
|
||||
map { [$c++ , length($_) ] } @words;
|
||||
};
|
||||
my @butted_words = $self->buttify(@$words);
|
||||
|
||||
# remove stop words
|
||||
@word_idxs_len_sorted = grep {
|
||||
my $word = $words[$_];
|
||||
return $self->_reassemble_with_whitespace($leading,
|
||||
\@butted_words,
|
||||
$whitespace);
|
||||
}
|
||||
|
||||
my $is_word = $word !~ /^[\d\W+]+$/;
|
||||
my $is_stop = $self->is_stop_word($word);
|
||||
my $is_meme = $self->is_meme($word);
|
||||
=head2 buttify(@words)
|
||||
|
||||
$is_word and not $is_stop and not $is_meme;
|
||||
} @word_idxs_len_sorted;
|
||||
Operates in a similar fashion to L</buttify_string>, but should be passed a
|
||||
pre-tokenised array of words. It returns an array of equal length in which some
|
||||
portion of (some of) the tokens have been replaced by the meme in
|
||||
C<$self-E<gt>meme>.
|
||||
|
||||
$self->{_word_indices} = \@word_idxs_len_sorted;
|
||||
=cut
|
||||
|
||||
sub buttify {
|
||||
my ($self, @words) = @_;
|
||||
my $how_many_butts = int(@words * $self->replace_freq) + 1;
|
||||
my $debug = $self->debug;
|
||||
|
||||
$self->_set_words(\@words);
|
||||
# sort indices by word length
|
||||
my @word_idxs_len_sorted = do {
|
||||
my $c;
|
||||
|
||||
map { $_->[0] }
|
||||
sort { $b->[1] <=> $a->[1] }
|
||||
map { [$c++ , length($_) ] } @words;
|
||||
};
|
||||
|
||||
# remove stop words
|
||||
@word_idxs_len_sorted = grep {
|
||||
my $word = $words[$_];
|
||||
|
||||
my $is_word = $word !~ /^[\d\W+]+$/;
|
||||
my $is_stop = $self->is_stop_word($word);
|
||||
my $is_meme = $self->is_meme($word);
|
||||
|
||||
$is_word and not $is_stop and not $is_meme;
|
||||
} @word_idxs_len_sorted;
|
||||
|
||||
$self->_set_word_indices(\@word_idxs_len_sorted);
|
||||
|
||||
# bail out if we've got nothing left. This happens
|
||||
# when a string is comprised entirely of stop-words.
|
||||
unless (@word_idxs_len_sorted) {
|
||||
$self->log("Couldn't buttify ", join(' ', @words),
|
||||
": entirely stopwords");
|
||||
return @words;
|
||||
}
|
||||
|
||||
# make sure we're not trying to butt too hard.
|
||||
if ($how_many_butts > @word_idxs_len_sorted) {
|
||||
$how_many_butts = scalar(@word_idxs_len_sorted);
|
||||
}
|
||||
|
||||
$self->log("buttifying with $how_many_butts repetitions");
|
||||
my $words_butted = {};
|
||||
my @initial_weights = _sq_weight_indices(scalar @word_idxs_len_sorted);
|
||||
|
||||
# Selecting words to butt works in the following way:
|
||||
|
||||
# * each (non-stop) word-index is assigned a weighting based on it's
|
||||
# ordinal when sorted by (word) length. So, the longest word has weight =
|
||||
# num_words ** 2, second longest is (num_words-1)**2, ...
|
||||
|
||||
# * A random distribution selects some index proportional to its weight.
|
||||
# * The word at this index is butted.
|
||||
# * The index is removed from consideration for subsequent buttings.
|
||||
|
||||
for my $c (0 .. $how_many_butts-1) {
|
||||
my ($xx_n, $xx_p, $xx_x)
|
||||
= $self->_build_weightings_for_index(\@initial_weights, $words_butted);
|
||||
|
||||
my $random_idx = get_walker_rand($xx_n, $xx_p, $xx_x);
|
||||
my $idx_to_butt = $word_idxs_len_sorted[$random_idx];
|
||||
|
||||
$self->log("Butting word idx: $idx_to_butt [",
|
||||
$words[$idx_to_butt], "]");
|
||||
|
||||
$words[$idx_to_butt]
|
||||
= $self->_buttsub($words[$idx_to_butt]);
|
||||
|
||||
$words_butted->{$random_idx} = 1;
|
||||
}
|
||||
|
||||
# bail out if we've got nothing left. This happens
|
||||
# when a string is comprised entirely of stop-words.
|
||||
unless (@word_idxs_len_sorted) {
|
||||
$self->log("Couldn't buttify ", join(' ', @words),
|
||||
": entirely stopwords");
|
||||
return @words;
|
||||
}
|
||||
|
||||
# make sure we're not trying to butt too hard.
|
||||
if ($how_many_butts > @word_idxs_len_sorted) {
|
||||
$how_many_butts = scalar(@word_idxs_len_sorted);
|
||||
}
|
||||
sub _buttsub {
|
||||
my ($self, $word) = @_;
|
||||
|
||||
$self->log("buttifying with $how_many_butts repetitions");
|
||||
my $words_butted = {};
|
||||
my @initial_weights = _sq_weight_indices(scalar @word_idxs_len_sorted);
|
||||
my $meme = $self->meme;
|
||||
|
||||
# Selecting words to butt works in the following way:
|
||||
# split off leading and trailing punctuation
|
||||
my ($lp, $actual_word, $rp) = $word =~ /^([^A-Za-z]*)(.*?)([^A-Za-z]*)$/;
|
||||
|
||||
# * each (non-stop) word-index is assigned a weighting based on it's
|
||||
# ordinal when sorted by (word) length. So, the longest word has weight =
|
||||
# num_words ** 2, second longest is (num_words-1)**2, ...
|
||||
return $word unless $actual_word;
|
||||
|
||||
# * A random distribution selects some index proportional to its weight.
|
||||
# * The word at this index is butted.
|
||||
# * The index is removed from consideration for subsequent buttings.
|
||||
my @points = (0, $self->hyphenator->hyphenate($actual_word));
|
||||
|
||||
for my $c (0 .. $how_many_butts-1) {
|
||||
my ($xx_n, $xx_p, $xx_x)
|
||||
= $self->_build_weightings_for_index(\@initial_weights, $words_butted);
|
||||
my $factor = 2;
|
||||
my $length = scalar @points;
|
||||
my $replace = $length - 1 - int(rand($length ** $factor) ** (1 / $factor));
|
||||
push @points, length($actual_word);
|
||||
|
||||
my $random_idx = get_walker_rand($xx_n, $xx_p, $xx_x);
|
||||
my $idx_to_butt = $word_idxs_len_sorted[$random_idx];
|
||||
my $l = $points[$replace];
|
||||
my $r = $points[$replace + 1] - $l;
|
||||
|
||||
$self->log("Butting word idx: $idx_to_butt [",
|
||||
$words[$idx_to_butt], "]");
|
||||
|
||||
$words[$idx_to_butt]
|
||||
= $self->_buttsub($words[$idx_to_butt]);
|
||||
|
||||
$words_butted->{$random_idx} = 1;
|
||||
}
|
||||
|
||||
return @words;
|
||||
}
|
||||
|
||||
sub _buttsub {
|
||||
my ($self, $word) = @_;
|
||||
|
||||
my $meme = $self->{meme};
|
||||
|
||||
# split off leading and trailing punctuation
|
||||
my ($lp, $actual_word, $rp) = $word =~ /^([^A-Za-z]*)(.*?)([^A-Za-z]*)$/;
|
||||
|
||||
return $word unless $actual_word;
|
||||
|
||||
my @points = (0, $self->{hyphenator}->hyphenate($actual_word));
|
||||
|
||||
my $factor = 2;
|
||||
my $length = scalar @points;
|
||||
my $replace = $length - 1 - int(rand($length ** $factor) ** (1 / $factor));
|
||||
push @points, length($actual_word);
|
||||
|
||||
my $l = $points[$replace];
|
||||
my $r = $points[$replace + 1] - $l;
|
||||
|
||||
while (substr($actual_word, $l + $r, 1) eq 't') {
|
||||
$r++;
|
||||
}
|
||||
while ($l > 0 && substr($actual_word, $l - 1, 1) eq 'b') {
|
||||
$l--;
|
||||
}
|
||||
my $sub = substr($actual_word, $l, $r);
|
||||
my $butt = lc($meme);
|
||||
|
||||
if ($sub eq uc $sub) {
|
||||
$butt = uc($meme);
|
||||
} elsif ($sub =~/^[A-Z]/) {
|
||||
$butt = ucfirst($meme);
|
||||
}
|
||||
|
||||
substr($actual_word, $l, $r) = $butt;
|
||||
return $lp . $actual_word . $rp;
|
||||
}
|
||||
|
||||
sub _build_weightings_for_index {
|
||||
my ($self, $initial_weights, $butted_indices) = @_;
|
||||
|
||||
#$self->log("Word indices remaining: ", @indices);
|
||||
|
||||
my $i = 0;
|
||||
$self->log(Dumper($butted_indices));
|
||||
$self->log(Dumper($initial_weights));
|
||||
|
||||
my @idx_weights = map {
|
||||
exists($butted_indices->{$i++})?0:$_
|
||||
} @$initial_weights;
|
||||
|
||||
my $str;
|
||||
$i = 0;
|
||||
for (@{$self->{_word_indices}}) {
|
||||
$str .= "\tIndex: $_: " . $self->{_words}->[$_]
|
||||
. " ,weight=" . $idx_weights[$i++] . "\n";
|
||||
}
|
||||
$self->log("index weightings:\n" . $str);
|
||||
|
||||
my ($n, $p, $x) = setup_walker_rand(\@idx_weights);
|
||||
return ($n, $p, $x)
|
||||
}
|
||||
|
||||
sub _sq_weight_indices {
|
||||
my $max = shift;
|
||||
return map { $max-- ** 2 } (0..$max-1);
|
||||
}
|
||||
|
||||
|
||||
# stealed frm http://code.activestate.com/recipes/576564/
|
||||
# and http://prxq.wordpress.com/2006/04/17/the-alias-method/
|
||||
# Copyright someone maybe somewhere?
|
||||
sub setup_walker_rand {
|
||||
my ($weight_ref) = @_;
|
||||
|
||||
my @weights = @$weight_ref;
|
||||
my $n = scalar @weights;
|
||||
my @in_x = (-1) x $n;
|
||||
my $sum_w = 0;
|
||||
$sum_w += $_ for @weights;
|
||||
|
||||
# normalise weights to have an average value of 1.
|
||||
@weights = map { $_ * $n / $sum_w } @weights;
|
||||
|
||||
my (@short, @long);
|
||||
my $i = 0;
|
||||
|
||||
# split into long and short groups (excluding those which == 1)
|
||||
for my $p (@weights) {
|
||||
if ($p < 1) {
|
||||
push @short, $i;
|
||||
} elsif ($p > 1) {
|
||||
push @long, $i;
|
||||
while (substr($actual_word, $l + $r, 1) eq 't') {
|
||||
$r++;
|
||||
}
|
||||
$i++;
|
||||
}
|
||||
|
||||
# build alias map by combining short and long elements.
|
||||
while (scalar @short and scalar @long) {
|
||||
my $j = pop @short;
|
||||
my $k = $long[-1];
|
||||
|
||||
$in_x[$j] = $k;
|
||||
$weights[$k] -= (1 - $weights[$j]);
|
||||
|
||||
if ($weights[$k] < 1) {
|
||||
push @short, $k;
|
||||
pop @long;
|
||||
while ($l > 0 && substr($actual_word, $l - 1, 1) eq 'b') {
|
||||
$l--;
|
||||
}
|
||||
# printf("test: j=%d k=%d pk=%.2f\n", $j, $k, $prob[$k]);
|
||||
my $sub = substr($actual_word, $l, $r);
|
||||
my $butt = lc($meme);
|
||||
|
||||
if ($sub eq uc $sub) {
|
||||
$butt = uc($meme);
|
||||
} elsif ($sub =~/^[A-Z]/) {
|
||||
$butt = ucfirst($meme);
|
||||
}
|
||||
|
||||
substr($actual_word, $l, $r) = $butt;
|
||||
return $lp . $actual_word . $rp;
|
||||
}
|
||||
return ($n, \@weights, \@in_x)
|
||||
}
|
||||
|
||||
sub get_walker_rand {
|
||||
my ($n, $prob, $in_x) = @_;
|
||||
my ($u, $j);
|
||||
$u = random_uniform(1,0,1);
|
||||
$j = random_uniform_integer(1, 0, $n-1);
|
||||
return ($u <= $prob->[$j]) ? $j : $in_x->[$j];
|
||||
}
|
||||
sub _build_weightings_for_index {
|
||||
my ($self, $initial_weights, $butted_indices) = @_;
|
||||
|
||||
sub log {
|
||||
my ($self, @msg) = @_;
|
||||
if ($self->{debug}) {
|
||||
print STDERR join(" ", @msg) . $/;
|
||||
#$self->log("Word indices remaining: ", @indices);
|
||||
|
||||
my $i = 0;
|
||||
|
||||
if ($self->debug) {
|
||||
$self->log(Dumper($butted_indices));
|
||||
$self->log(Dumper($initial_weights));
|
||||
}
|
||||
|
||||
my @idx_weights = map {
|
||||
exists($butted_indices->{$i++})?0:$_
|
||||
} @$initial_weights;
|
||||
|
||||
my $str;
|
||||
$i = 0;
|
||||
for (@{$self->word_indices}) {
|
||||
$str .= "\tIndex: $_: " . $self->words->[$_]
|
||||
. " ,weight=" . $idx_weights[$i++] . "\n";
|
||||
}
|
||||
$self->log("index weightings:\n" . $str);
|
||||
|
||||
my ($n, $p, $x) = setup_walker_rand(\@idx_weights);
|
||||
return ($n, $p, $x)
|
||||
}
|
||||
}
|
||||
|
||||
1;
|
||||
sub _sq_weight_indices {
|
||||
my $max = shift;
|
||||
return map { $max-- ** 2 } (0..$max-1);
|
||||
}
|
||||
|
||||
|
||||
# stealed from http://code.activestate.com/recipes/576564/
|
||||
# and http://prxq.wordpress.com/2006/04/17/the-alias-method/
|
||||
# Copyright someone maybe somewhere?
|
||||
sub setup_walker_rand {
|
||||
my ($weight_ref) = @_;
|
||||
|
||||
my @weights = @$weight_ref;
|
||||
my $n = scalar @weights;
|
||||
my @in_x = (-1) x $n;
|
||||
my $sum_w = 0;
|
||||
$sum_w += $_ for @weights;
|
||||
|
||||
# normalise weights to have an average value of 1.
|
||||
@weights = map { $_ * $n / $sum_w } @weights;
|
||||
|
||||
my (@short, @long);
|
||||
my $i = 0;
|
||||
|
||||
# split into long and short groups (excluding those which == 1)
|
||||
for my $p (@weights) {
|
||||
if ($p < 1) {
|
||||
push @short, $i;
|
||||
} elsif ($p > 1) {
|
||||
push @long, $i;
|
||||
}
|
||||
$i++;
|
||||
}
|
||||
|
||||
# build alias map by combining short and long elements.
|
||||
while (scalar @short and scalar @long) {
|
||||
my $j = pop @short;
|
||||
my $k = $long[-1];
|
||||
|
||||
$in_x[$j] = $k;
|
||||
$weights[$k] -= (1 - $weights[$j]);
|
||||
|
||||
if ($weights[$k] < 1) {
|
||||
push @short, $k;
|
||||
pop @long;
|
||||
}
|
||||
# printf("test: j=%d k=%d pk=%.2f\n", $j, $k, $prob[$k]);
|
||||
}
|
||||
return ($n, \@weights, \@in_x)
|
||||
}
|
||||
|
||||
sub get_walker_rand {
|
||||
my ($n, $prob, $in_x) = @_;
|
||||
my ($u, $j);
|
||||
$u = random_uniform(1,0,1);
|
||||
$j = random_uniform_integer(1, 0, $n-1);
|
||||
return ($u <= $prob->[$j]) ? $j : $in_x->[$j];
|
||||
}
|
||||
|
||||
sub log {
|
||||
my ($self, @msg) = @_;
|
||||
if ($self->debug) {
|
||||
print STDERR join(" ", @msg) . $/;
|
||||
}
|
||||
}
|
||||
|
||||
no Moose;
|
||||
__PACKAGE__->meta->make_immutable;
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue