2009-11-11 06:49:06 -08:00
|
|
|
=head1 NAME
|
|
|
|
|
|
|
|
Butts - replace random syllables with the arbitrary memes.
|
|
|
|
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
|
|
|
|
# with all defaults
|
|
|
|
my $butter = Butts->new;
|
|
|
|
$butter->buttify_string("hello there");
|
|
|
|
|
|
|
|
# with all known options
|
|
|
|
my $butter = Butts->new(
|
|
|
|
meme => 'butt',
|
|
|
|
replace_freq => (1/11),
|
|
|
|
debug => 0,
|
|
|
|
hyphen_file => 'hyphens.tex',
|
|
|
|
stopwords_file => 'stopwords',
|
|
|
|
);
|
|
|
|
|
|
|
|
$butter->buttify(@tokens);
|
|
|
|
$butter->buttify_string($string);
|
|
|
|
|
|
|
|
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
|
|
|
|
Yes.
|
|
|
|
|
|
|
|
=cut
|
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
=head1 CONSTRUCTOR
|
2007-12-11 00:50:39 -08:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
=over
|
2007-12-11 00:41:21 -08:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
=item C<new>
|
2009-10-28 09:02:56 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
Takes a number of optional arguments:
|
|
|
|
'debug', sets module debugging output on or off.
|
|
|
|
'hyphen_file', specify a different hyphen file for L<TeX::Hyphen>,
|
|
|
|
defaults to C<$module_dir/hyphen.tex>.
|
|
|
|
'stopwords_file', specify a different source of stopwords,
|
|
|
|
defaults to C<$module_dir/stopwords>.
|
|
|
|
|
|
|
|
=back
|
2009-11-11 06:49:06 -08:00
|
|
|
|
|
|
|
=cut
|
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
use strict;
|
|
|
|
use warnings;
|
2009-10-28 09:02:56 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
{
|
|
|
|
|
|
|
|
package Butts;
|
|
|
|
|
|
|
|
use Moose;
|
|
|
|
|
|
|
|
use Math::Random;
|
|
|
|
use TeX::Hyphen;
|
|
|
|
use Data::Dumper;
|
2009-11-11 16:26:01 -08:00
|
|
|
use Dir::Self;
|
2009-11-11 16:21:35 -08:00
|
|
|
use Carp;
|
|
|
|
|
|
|
|
|
|
|
|
has 'replace_freq' =>
|
|
|
|
(
|
|
|
|
isa => 'Num',
|
|
|
|
is => 'rw',
|
|
|
|
default => sub { 1/11 }
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
has 'meme' =>
|
|
|
|
(
|
|
|
|
isa => 'Str',
|
|
|
|
is => 'rw',
|
|
|
|
default => sub { 'butt' },
|
|
|
|
required => 1,
|
|
|
|
);
|
|
|
|
|
|
|
|
has 'hyphen_file' =>
|
|
|
|
(
|
|
|
|
isa => 'Str',
|
|
|
|
is => 'ro',
|
2009-11-11 16:26:01 -08:00
|
|
|
default => sub { __DIR__ . '/hyphen.tex' },
|
2009-11-11 16:21:35 -08:00
|
|
|
);
|
|
|
|
|
|
|
|
has 'stopwords_file' =>
|
|
|
|
(
|
|
|
|
isa => 'Str',
|
|
|
|
is => 'ro',
|
2009-11-11 16:26:01 -08:00
|
|
|
default => sub { __DIR__ . '/stopwords' },
|
2009-11-11 16:21:35 -08:00
|
|
|
);
|
|
|
|
|
|
|
|
has 'debug' =>
|
|
|
|
(
|
|
|
|
isa => 'Bool',
|
|
|
|
is => 'rw',
|
|
|
|
required => 1,
|
|
|
|
default => sub { 0 },
|
|
|
|
);
|
|
|
|
|
|
|
|
has 'hyphenator' =>
|
|
|
|
(
|
|
|
|
isa => 'TeX::Hyphen',
|
|
|
|
is => 'ro',
|
|
|
|
lazy => 1,
|
|
|
|
builder => '_build_hyphenator',
|
|
|
|
);
|
|
|
|
|
|
|
|
has 'stopwords' =>
|
|
|
|
(
|
|
|
|
isa => 'HashRef[Str]',
|
|
|
|
is => 'ro',
|
|
|
|
lazy => 1,
|
|
|
|
builder => '_build_stopwords',
|
|
|
|
);
|
|
|
|
|
|
|
|
has 'words' =>
|
|
|
|
(
|
|
|
|
isa => 'ArrayRef[Str]',
|
|
|
|
is => 'ro',
|
|
|
|
writer => '_set_words',
|
|
|
|
default => sub { [] },
|
|
|
|
);
|
|
|
|
|
|
|
|
has 'word_indices' =>
|
|
|
|
(
|
|
|
|
isa => 'ArrayRef[Int]',
|
|
|
|
is => 'ro',
|
|
|
|
default => sub { [] },
|
|
|
|
writer => '_set_word_indices',
|
|
|
|
);
|
2009-11-10 07:18:52 -08:00
|
|
|
|
2009-10-28 09:02:56 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
=head1 METHODS
|
2009-10-28 09:02:56 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
=cut
|
|
|
|
|
|
|
|
sub _build_hyphenator {
|
|
|
|
my $self = shift;
|
|
|
|
return TeX::Hyphen->new(file => $self->hyphen_file);
|
2009-10-28 09:02:56 -07:00
|
|
|
}
|
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
sub _build_stopwords {
|
|
|
|
my $self = shift;
|
|
|
|
my @stopwords;
|
|
|
|
if (open my $sfh, $self->stopwords_file) {
|
|
|
|
chomp(@stopwords = <$sfh>);
|
|
|
|
close $sfh;
|
|
|
|
} else {
|
|
|
|
carp "Couldn't read stopwords file "
|
|
|
|
. $self->stopwords_file . ' ' . $!;
|
|
|
|
@stopwords = qw/a an and or but it in its It's it's the of you I i/;
|
|
|
|
}
|
2009-10-28 09:02:56 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
return { map { lc($_) => 1 } @stopwords };
|
|
|
|
}
|
2007-12-10 20:29:18 -08:00
|
|
|
|
2009-11-11 06:49:06 -08:00
|
|
|
=head2 meme($value)
|
|
|
|
|
|
|
|
Method which sets / returns the current replacement meme. If called without
|
|
|
|
additional arguments, it returns the current meme. Calling it with a scalar
|
|
|
|
replaces the old meme with a new one.
|
|
|
|
|
|
|
|
=head2 replace_freq($value)
|
|
|
|
|
|
|
|
Getter/Setter Method for the replacement frequency. Value should be passed as a
|
|
|
|
fractional value, which corresponds to the number of words considered for meme
|
|
|
|
replacement via the following calculation:
|
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
=head2 debug($value)
|
|
|
|
|
|
|
|
Turn debugging output on (C<1>) or off (C<0>). Debugging output is printed to
|
|
|
|
C<STDERR>.
|
|
|
|
|
2009-11-11 06:49:06 -08:00
|
|
|
=cut
|
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
# helpers
|
|
|
|
sub is_stop_word {
|
|
|
|
my ($self, $word) = @_;
|
|
|
|
return exists $self->stopwords->{lc($word)};
|
2009-10-28 11:38:10 -07:00
|
|
|
}
|
2009-12-07 07:45:27 -08:00
|
|
|
sub is_url {
|
|
|
|
my ($self, $word) = @_;
|
|
|
|
return $word =~ /^https?:\/\//i;
|
|
|
|
}
|
2007-12-10 20:29:18 -08:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
sub is_meme {
|
|
|
|
my ($self, $word) = @_;
|
|
|
|
return lc($word) eq lc($self->meme);
|
2009-10-28 09:02:56 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
}
|
2008-06-16 18:05:42 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
sub _split_preserving_whitespace {
|
|
|
|
my ($self, $string) = @_;
|
2009-10-28 14:57:59 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
my ($leading_ws, $remainder) = ($string =~ m/^(\s*)(.*)$/s);
|
2010-08-05 09:07:42 -07:00
|
|
|
$leading_ws = defined $leading_ws ? $leading_ws : '';
|
2009-10-28 14:57:59 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
my @all_split = split(/(\s+)/, $remainder);
|
|
|
|
my (@words, @ws);
|
|
|
|
foreach my $tok (@all_split) {
|
|
|
|
if ($tok =~ m/^\s+$/) {
|
|
|
|
push @ws, $tok
|
|
|
|
} else {
|
|
|
|
push @words, $tok
|
|
|
|
}
|
2009-10-28 14:57:59 -07:00
|
|
|
}
|
2009-11-11 16:21:35 -08:00
|
|
|
return ($leading_ws, \@words, \@ws);
|
2009-10-28 14:57:59 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
sub _reassemble_with_whitespace {
|
|
|
|
my ($self, $leading, $words, $ws) = @_;
|
2009-10-28 14:57:59 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
# interleave the two arrays. Words always come first, because
|
|
|
|
# any leading space is in $leading.
|
|
|
|
# http://www.perlmonks.org/?node_id=53605
|
2009-10-28 14:57:59 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
# if things are different sizes we'll end up with some undefs,
|
|
|
|
# so grep them out.
|
|
|
|
my @ret = grep { defined } map { $words->[$_], $ws->[$_] }
|
|
|
|
0 .. ($#$words > $#$ws ? $#$words : $#$ws);
|
|
|
|
# and convert back to a string.
|
|
|
|
return $leading . join('', @ret);
|
|
|
|
}
|
2009-10-28 14:57:59 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
=head2 buttify_string
|
2009-10-28 14:57:59 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
This method is the core of Butts.pm. It takes a string argument (or defaults
|
|
|
|
to C<$_> if none is given, and returns a string in which random parts of words
|
|
|
|
have been replaced with the contents of C<$self-E<gt>meme>.
|
2009-10-28 14:57:59 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
The original whitespace of the string is preserved as far as possible.
|
2008-06-12 12:28:55 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
=cut
|
2007-12-10 20:29:18 -08:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
sub buttify_string($_) {
|
|
|
|
my $self = shift;
|
|
|
|
# glom a string from $_ if we didn't get one passed.
|
|
|
|
my $str = (@_ ? $_[0] : $_);
|
|
|
|
chomp($str);
|
2008-06-17 06:02:34 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
# FIX for http://code.google.com/p/buttbot/issues/detail?id=7
|
|
|
|
my ($leading, $words, $whitespace)
|
|
|
|
= $self->_split_preserving_whitespace($str);
|
2007-12-10 20:29:18 -08:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
my @butted_words = $self->buttify(@$words);
|
2008-06-17 06:32:34 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
return $self->_reassemble_with_whitespace($leading,
|
|
|
|
\@butted_words,
|
|
|
|
$whitespace);
|
|
|
|
}
|
2008-06-17 06:32:34 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
=head2 buttify(@words)
|
2009-10-28 09:02:56 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
Operates in a similar fashion to L</buttify_string>, but should be passed a
|
|
|
|
pre-tokenised array of words. It returns an array of equal length in which some
|
|
|
|
portion of (some of) the tokens have been replaced by the meme in
|
|
|
|
C<$self-E<gt>meme>.
|
2009-10-28 14:57:59 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
=cut
|
2009-10-28 09:02:56 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
sub buttify {
|
|
|
|
my ($self, @words) = @_;
|
|
|
|
my $how_many_butts = int(@words * $self->replace_freq) + 1;
|
|
|
|
my $debug = $self->debug;
|
2009-10-28 10:01:41 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
$self->_set_words(\@words);
|
|
|
|
# sort indices by word length
|
|
|
|
my @word_idxs_len_sorted = do {
|
|
|
|
my $c;
|
2009-10-28 14:57:59 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
map { $_->[0] }
|
|
|
|
sort { $b->[1] <=> $a->[1] }
|
|
|
|
map { [$c++ , length($_) ] } @words;
|
|
|
|
};
|
2009-10-28 11:38:10 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
# remove stop words
|
|
|
|
@word_idxs_len_sorted = grep {
|
|
|
|
my $word = $words[$_];
|
2009-10-28 14:57:59 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
my $is_word = $word !~ /^[\d\W+]+$/;
|
|
|
|
my $is_stop = $self->is_stop_word($word);
|
|
|
|
my $is_meme = $self->is_meme($word);
|
2009-12-07 07:45:27 -08:00
|
|
|
my $is_url = $self->is_url($word);
|
2009-10-28 10:01:41 -07:00
|
|
|
|
2009-12-07 07:45:27 -08:00
|
|
|
$is_word and not $is_stop and not $is_meme and not $is_url;
|
2009-11-11 16:21:35 -08:00
|
|
|
} @word_idxs_len_sorted;
|
2009-10-28 09:02:56 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
$self->_set_word_indices(\@word_idxs_len_sorted);
|
2009-10-28 11:38:10 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
# bail out if we've got nothing left. This happens
|
|
|
|
# when a string is comprised entirely of stop-words.
|
|
|
|
unless (@word_idxs_len_sorted) {
|
|
|
|
$self->log("Couldn't buttify ", join(' ', @words),
|
|
|
|
": entirely stopwords");
|
|
|
|
return @words;
|
|
|
|
}
|
2009-10-28 11:38:10 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
# make sure we're not trying to butt too hard.
|
|
|
|
if ($how_many_butts > @word_idxs_len_sorted) {
|
|
|
|
$how_many_butts = scalar(@word_idxs_len_sorted);
|
|
|
|
}
|
2009-10-28 11:38:10 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
$self->log("buttifying with $how_many_butts repetitions");
|
|
|
|
my $words_butted = {};
|
|
|
|
my @initial_weights = _sq_weight_indices(scalar @word_idxs_len_sorted);
|
2007-12-10 20:29:18 -08:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
# Selecting words to butt works in the following way:
|
2007-12-10 20:29:18 -08:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
# * each (non-stop) word-index is assigned a weighting based on it's
|
|
|
|
# ordinal when sorted by (word) length. So, the longest word has weight =
|
|
|
|
# num_words ** 2, second longest is (num_words-1)**2, ...
|
2009-10-28 09:02:56 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
# * A random distribution selects some index proportional to its weight.
|
|
|
|
# * The word at this index is butted.
|
|
|
|
# * The index is removed from consideration for subsequent buttings.
|
2008-11-02 16:19:48 -08:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
for my $c (0 .. $how_many_butts-1) {
|
|
|
|
my ($xx_n, $xx_p, $xx_x)
|
|
|
|
= $self->_build_weightings_for_index(\@initial_weights, $words_butted);
|
2008-11-02 16:19:48 -08:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
my $random_idx = get_walker_rand($xx_n, $xx_p, $xx_x);
|
|
|
|
my $idx_to_butt = $word_idxs_len_sorted[$random_idx];
|
2009-10-28 14:57:59 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
$self->log("Butting word idx: $idx_to_butt [",
|
|
|
|
$words[$idx_to_butt], "]");
|
2008-11-02 16:19:48 -08:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
$words[$idx_to_butt]
|
|
|
|
= $self->_buttsub($words[$idx_to_butt]);
|
2008-11-02 16:19:48 -08:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
$words_butted->{$random_idx} = 1;
|
|
|
|
}
|
2009-10-28 14:57:59 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
return @words;
|
2009-10-28 09:02:56 -07:00
|
|
|
}
|
2008-11-02 16:19:48 -08:00
|
|
|
|
2010-04-30 07:40:58 -07:00
|
|
|
sub _find_repeating_vowel {
|
|
|
|
my ($self, $word) = @_;
|
|
|
|
my $vowels = "aeiouAEIOU";
|
|
|
|
|
|
|
|
my $j = 0;
|
|
|
|
my $j_record = 0;
|
|
|
|
my $k_record = 0;
|
|
|
|
while ($j < length($word)) {
|
|
|
|
if (index($vowels, substr($word,$j,1)) > -1) {
|
|
|
|
# $word[$j] is a vowel; how many times does it repeat?
|
|
|
|
my $k = 0;
|
|
|
|
do {
|
|
|
|
++$k;
|
|
|
|
} while (($j + $k < length($word)) && (substr($word,$j+$k,1) eq substr($word,$j,1)));
|
|
|
|
# save the vowel that repeats most
|
|
|
|
if ($k > $k_record) {
|
|
|
|
$j_record = $j;
|
|
|
|
$k_record = $k;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
++$j;
|
|
|
|
}
|
|
|
|
return ($j_record, $k_record);
|
|
|
|
}
|
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
sub _buttsub {
|
|
|
|
my ($self, $word) = @_;
|
2009-10-28 14:57:59 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
my $meme = $self->meme;
|
2009-10-28 11:38:10 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
# split off leading and trailing punctuation
|
|
|
|
my ($lp, $actual_word, $rp) = $word =~ /^([^A-Za-z]*)(.*?)([^A-Za-z]*)$/;
|
2009-10-28 11:38:10 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
return $word unless $actual_word;
|
2009-10-28 11:38:10 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
my @points = (0, $self->hyphenator->hyphenate($actual_word));
|
2009-10-28 11:38:10 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
my $factor = 2;
|
|
|
|
my $length = scalar @points;
|
|
|
|
my $replace = $length - 1 - int(rand($length ** $factor) ** (1 / $factor));
|
|
|
|
push @points, length($actual_word);
|
2009-10-28 14:57:59 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
my $l = $points[$replace];
|
|
|
|
my $r = $points[$replace + 1] - $l;
|
2009-10-28 11:38:10 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
while (substr($actual_word, $l + $r, 1) eq 't') {
|
|
|
|
$r++;
|
|
|
|
}
|
|
|
|
while ($l > 0 && substr($actual_word, $l - 1, 1) eq 'b') {
|
|
|
|
$l--;
|
|
|
|
}
|
|
|
|
my $sub = substr($actual_word, $l, $r);
|
|
|
|
my $butt = lc($meme);
|
2007-12-10 20:29:18 -08:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
if ($sub eq uc $sub) {
|
|
|
|
$butt = uc($meme);
|
|
|
|
} elsif ($sub =~/^[A-Z]/) {
|
|
|
|
$butt = ucfirst($meme);
|
|
|
|
}
|
2008-06-17 04:31:20 -07:00
|
|
|
|
2010-04-30 07:40:58 -07:00
|
|
|
my ($j, $k) = $self->_find_repeating_vowel($sub);
|
|
|
|
|
|
|
|
if ($k > 2) {
|
|
|
|
my $k2;
|
|
|
|
($j, $k2) = $self->_find_repeating_vowel($butt);
|
|
|
|
substr($butt, $j, 1) = substr($butt, $j, 1) x $k;
|
|
|
|
}
|
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
substr($actual_word, $l, $r) = $butt;
|
|
|
|
return $lp . $actual_word . $rp;
|
|
|
|
}
|
2008-06-17 04:31:20 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
sub _build_weightings_for_index {
|
|
|
|
my ($self, $initial_weights, $butted_indices) = @_;
|
2009-10-28 09:02:56 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
#$self->log("Word indices remaining: ", @indices);
|
2009-10-28 09:02:56 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
my $i = 0;
|
2009-10-28 09:02:56 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
if ($self->debug) {
|
|
|
|
$self->log(Dumper($butted_indices));
|
|
|
|
$self->log(Dumper($initial_weights));
|
|
|
|
}
|
2009-10-28 09:02:56 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
my @idx_weights = map {
|
|
|
|
exists($butted_indices->{$i++})?0:$_
|
|
|
|
} @$initial_weights;
|
|
|
|
|
|
|
|
my $str;
|
|
|
|
$i = 0;
|
|
|
|
for (@{$self->word_indices}) {
|
|
|
|
$str .= "\tIndex: $_: " . $self->words->[$_]
|
|
|
|
. " ,weight=" . $idx_weights[$i++] . "\n";
|
2009-10-28 09:02:56 -07:00
|
|
|
}
|
2009-11-11 16:21:35 -08:00
|
|
|
$self->log("index weightings:\n" . $str);
|
|
|
|
|
|
|
|
my ($n, $p, $x) = setup_walker_rand(\@idx_weights);
|
|
|
|
return ($n, $p, $x)
|
2009-10-28 09:02:56 -07:00
|
|
|
}
|
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
sub _sq_weight_indices {
|
|
|
|
my $max = shift;
|
|
|
|
return map { $max-- ** 2 } (0..$max-1);
|
|
|
|
}
|
2009-10-28 09:02:56 -07:00
|
|
|
|
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
# stealed from http://code.activestate.com/recipes/576564/
|
|
|
|
# and http://prxq.wordpress.com/2006/04/17/the-alias-method/
|
|
|
|
# Copyright someone maybe somewhere?
|
|
|
|
sub setup_walker_rand {
|
|
|
|
my ($weight_ref) = @_;
|
|
|
|
|
|
|
|
my @weights = @$weight_ref;
|
|
|
|
my $n = scalar @weights;
|
|
|
|
my @in_x = (-1) x $n;
|
|
|
|
my $sum_w = 0;
|
|
|
|
$sum_w += $_ for @weights;
|
|
|
|
|
|
|
|
# normalise weights to have an average value of 1.
|
|
|
|
@weights = map { $_ * $n / $sum_w } @weights;
|
|
|
|
|
|
|
|
my (@short, @long);
|
|
|
|
my $i = 0;
|
|
|
|
|
|
|
|
# split into long and short groups (excluding those which == 1)
|
|
|
|
for my $p (@weights) {
|
|
|
|
if ($p < 1) {
|
|
|
|
push @short, $i;
|
|
|
|
} elsif ($p > 1) {
|
|
|
|
push @long, $i;
|
|
|
|
}
|
|
|
|
$i++;
|
2009-10-28 09:02:56 -07:00
|
|
|
}
|
2009-11-11 16:21:35 -08:00
|
|
|
|
|
|
|
# build alias map by combining short and long elements.
|
|
|
|
while (scalar @short and scalar @long) {
|
|
|
|
my $j = pop @short;
|
|
|
|
my $k = $long[-1];
|
|
|
|
|
|
|
|
$in_x[$j] = $k;
|
|
|
|
$weights[$k] -= (1 - $weights[$j]);
|
|
|
|
|
|
|
|
if ($weights[$k] < 1) {
|
|
|
|
push @short, $k;
|
|
|
|
pop @long;
|
|
|
|
}
|
|
|
|
# printf("test: j=%d k=%d pk=%.2f\n", $j, $k, $prob[$k]);
|
|
|
|
}
|
|
|
|
return ($n, \@weights, \@in_x)
|
2009-10-28 09:02:56 -07:00
|
|
|
}
|
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
sub get_walker_rand {
|
|
|
|
my ($n, $prob, $in_x) = @_;
|
|
|
|
my ($u, $j);
|
|
|
|
$u = random_uniform(1,0,1);
|
|
|
|
$j = random_uniform_integer(1, 0, $n-1);
|
|
|
|
return ($u <= $prob->[$j]) ? $j : $in_x->[$j];
|
|
|
|
}
|
2009-10-28 09:02:56 -07:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
sub log {
|
|
|
|
my ($self, @msg) = @_;
|
|
|
|
if ($self->debug) {
|
|
|
|
print STDERR join(" ", @msg) . $/;
|
|
|
|
}
|
2009-10-28 09:02:56 -07:00
|
|
|
}
|
2007-12-10 20:29:18 -08:00
|
|
|
|
2009-11-11 16:21:35 -08:00
|
|
|
no Moose;
|
|
|
|
__PACKAGE__->meta->make_immutable;
|
|
|
|
|
|
|
|
}
|