339 lines
8.4 KiB
Perl
339 lines
8.4 KiB
Perl
package Butts;
|
|
|
|
use strict;
|
|
use warnings;
|
|
|
|
use Math::Random;
|
|
use TeX::Hyphen;
|
|
use Data::Dumper;
|
|
use Carp;
|
|
|
|
use fields qw/replace_freq
|
|
meme
|
|
hyphen_file
|
|
stopwords_file
|
|
hyphenator
|
|
stopwords
|
|
debug
|
|
_words
|
|
_word_indices/;
|
|
|
|
sub new {
|
|
my Butts $self = shift;
|
|
unless (ref $self) {
|
|
$self = fields::new($self);
|
|
}
|
|
|
|
my %args = (hyphen_file => 'hyphen.tex',
|
|
stopwords_file => 'stopwords',
|
|
debug => 0,
|
|
meme => 'butt',
|
|
replace_freq => (1/11), # original value from tef.
|
|
@_);
|
|
|
|
foreach my $key (keys %args) {
|
|
$self->{$key} = $args{$key};
|
|
}
|
|
|
|
$self->{hyphenator} = new TeX::Hyphen((file => $self->{hyphen_file}))
|
|
or croak "Couldn't create TeX::Hyphen instance from " . $self->{hyphen_file};
|
|
|
|
my @stopwords;
|
|
if (open my $sfh, $self->{stopwords_file}) {
|
|
chomp(@stopwords = <$sfh>);
|
|
close $sfh;
|
|
} else {
|
|
carp "Couldn't read stopwords file "
|
|
. $self->{stopwords_file} . ' ' . $!;
|
|
@stopwords = qw/a an and or but it in its It's it's the of you I i/;
|
|
}
|
|
|
|
$self->{stopwords} = { map { lc($_) => 1 } @stopwords };
|
|
|
|
return $self;
|
|
}
|
|
|
|
# accessors
|
|
sub meme {
|
|
my $self = shift;
|
|
if (@_) {
|
|
$self->{meme} = $_[0];
|
|
}
|
|
return $self->{meme}
|
|
}
|
|
|
|
sub replace_freq {
|
|
my $self = shift;
|
|
if (@_) {
|
|
$self->{replace_freq} = $_[0];
|
|
}
|
|
return $self->{replace_freq}
|
|
}
|
|
|
|
# helpers
|
|
sub is_stop_word {
|
|
my ($self, $word) = @_;
|
|
return exists $self->{stopwords}->{lc($word)};
|
|
}
|
|
|
|
sub is_meme {
|
|
my ($self, $word) = @_;
|
|
return lc($word) eq lc($self->{meme});
|
|
|
|
}
|
|
|
|
sub split_preserving_whitespace {
|
|
my ($string) = @_;
|
|
|
|
my ($leading_ws, $remainder) = ($string =~ m/^(\s*)(.*)$/);
|
|
$leading_ws //= '';
|
|
|
|
my @all_split = split(/(\s+)/, $remainder);
|
|
my (@words, @ws);
|
|
foreach my $tok (@all_split) {
|
|
if ($tok =~ m/^\s+$/) {
|
|
push @ws, $tok
|
|
} else {
|
|
push @words, $tok
|
|
}
|
|
}
|
|
return ($leading_ws, \@words, \@ws);
|
|
}
|
|
|
|
|
|
sub reassemble_with_whitespace {
|
|
my ($leading, $words, $ws) = @_;
|
|
|
|
# interleave the two arrays. Words always come first, because
|
|
# any leading space is in $leading.
|
|
# http://www.perlmonks.org/?node_id=53605
|
|
|
|
# if things are different sizes we'll end up with some undefs,
|
|
# so grep them out.
|
|
my @ret = grep { defined } map { $words->[$_], $ws->[$_] }
|
|
0 .. ($#$words > $#$ws ? $#$words : $#$ws);
|
|
# and convert back to a string.
|
|
return $leading . join('', @ret);
|
|
}
|
|
|
|
sub buttify_string($_) {
|
|
my $self = shift;
|
|
# glom a string from $_ if we didn't get one passed.
|
|
my $str = (@_ ? $_[0] : $_);
|
|
chomp($str);
|
|
|
|
# FIX for http://code.google.com/p/buttbot/issues/detail?id=7
|
|
my ($leading, $words, $whitespace)
|
|
= split_preserving_whitespace($str);
|
|
|
|
my @butted_words = $self->buttify(@$words);
|
|
|
|
return reassemble_with_whitespace($leading, \@butted_words, $whitespace);
|
|
}
|
|
|
|
sub buttify {
|
|
my ($self, @words) = @_;
|
|
my $how_many_butts = int(@words * $self->{replace_freq}) + 1;
|
|
my $debug = $self->{debug};
|
|
|
|
$self->{_words} = \@words;
|
|
# sort indices by word length
|
|
my @word_idxs_len_sorted = do {
|
|
my $c;
|
|
|
|
map { $_->[0] }
|
|
sort { $b->[1] <=> $a->[1] }
|
|
map { [$c++ , length($_) ] } @words;
|
|
};
|
|
|
|
# remove stop words
|
|
@word_idxs_len_sorted = grep {
|
|
my $word = $words[$_];
|
|
|
|
my $is_word = $word !~ /^[\d\W+]+$/;
|
|
my $is_stop = $self->is_stop_word($word);
|
|
my $is_meme = $self->is_meme($word);
|
|
|
|
$is_word and not $is_stop and not $is_meme;
|
|
} @word_idxs_len_sorted;
|
|
|
|
$self->{_word_indices} = \@word_idxs_len_sorted;
|
|
|
|
# bail out if we've got nothing left. This happens
|
|
# when a string is comprised entirely of stop-words.
|
|
unless (@word_idxs_len_sorted) {
|
|
$self->log("Couldn't buttify ", join(' ', @words),
|
|
": entirely stopwords");
|
|
return @words;
|
|
}
|
|
|
|
# make sure we're not trying to butt too hard.
|
|
if ($how_many_butts > @word_idxs_len_sorted) {
|
|
$how_many_butts = scalar(@word_idxs_len_sorted);
|
|
}
|
|
|
|
$self->log("buttifying with $how_many_butts repetitions");
|
|
my $words_butted = {};
|
|
my @initial_weights = _sq_weight_indices(scalar @word_idxs_len_sorted);
|
|
|
|
# Selecting words to butt works in the following way:
|
|
|
|
# * each (non-stop) word-index is assigned a weighting based on it's
|
|
# ordinal when sorted by (word) length. So, the longest word has weight =
|
|
# num_words ** 2, second longest is (num_words-1)**2, ...
|
|
|
|
# * A random distribution selects some index proportional to its weight.
|
|
# * The word at this index is butted.
|
|
# * The index is removed from consideration for subsequent buttings.
|
|
|
|
for my $c (0 .. $how_many_butts-1) {
|
|
my ($xx_n, $xx_p, $xx_x)
|
|
= $self->_build_weightings_for_index(\@initial_weights, $words_butted);
|
|
|
|
my $random_idx = get_walker_rand($xx_n, $xx_p, $xx_x);
|
|
my $idx_to_butt = $word_idxs_len_sorted[$random_idx];
|
|
|
|
$self->log("Butting word idx: $idx_to_butt [",
|
|
$words[$idx_to_butt], "]");
|
|
|
|
$words[$idx_to_butt]
|
|
= $self->_buttsub($words[$idx_to_butt]);
|
|
|
|
$words_butted->{$random_idx} = 1;
|
|
}
|
|
|
|
return @words;
|
|
}
|
|
|
|
sub _buttsub {
|
|
my ($self, $word) = @_;
|
|
|
|
my $meme = $self->{meme};
|
|
|
|
# split off leading and trailing punctuation
|
|
my ($lp, $actual_word, $rp) = $word =~ /^([^A-Za-z]*)(.*?)([^A-Za-z]*)$/;
|
|
|
|
return $word unless $actual_word;
|
|
|
|
my @points = (0, $self->{hyphenator}->hyphenate($actual_word));
|
|
|
|
my $factor = 2;
|
|
my $length = scalar @points;
|
|
my $replace = $length - 1 - int(rand($length ** $factor) ** (1 / $factor));
|
|
push @points, length($actual_word);
|
|
|
|
my $l = $points[$replace];
|
|
my $r = $points[$replace + 1] - $l;
|
|
|
|
while (substr($actual_word, $l + $r, 1) eq 't') {
|
|
$r++;
|
|
}
|
|
while ($l > 0 && substr($actual_word, $l - 1, 1) eq 'b') {
|
|
$l--;
|
|
}
|
|
my $sub = substr($actual_word, $l, $r);
|
|
my $butt = lc($meme);
|
|
|
|
if ($sub eq uc $sub) {
|
|
$butt = uc($meme);
|
|
} elsif ($sub =~/^[A-Z]/) {
|
|
$butt = ucfirst($meme);
|
|
}
|
|
|
|
substr($actual_word, $l, $r) = $butt;
|
|
return $lp . $actual_word . $rp;
|
|
}
|
|
|
|
sub _build_weightings_for_index {
|
|
my ($self, $initial_weights, $butted_indices) = @_;
|
|
|
|
#$self->log("Word indices remaining: ", @indices);
|
|
|
|
my $i = 0;
|
|
$self->log(Dumper($butted_indices));
|
|
$self->log(Dumper($initial_weights));
|
|
|
|
my @idx_weights = map {
|
|
exists($butted_indices->{$i++})?0:$_
|
|
} @$initial_weights;
|
|
|
|
my $str;
|
|
$i = 0;
|
|
for (@{$self->{_word_indices}}) {
|
|
$str .= "\tIndex: $_: " . $self->{_words}->[$_]
|
|
. " ,weight=" . $idx_weights[$i++] . "\n";
|
|
}
|
|
$self->log("index weightings:\n" . $str);
|
|
|
|
my ($n, $p, $x) = setup_walker_rand(\@idx_weights);
|
|
return ($n, $p, $x)
|
|
}
|
|
|
|
sub _sq_weight_indices {
|
|
my $max = shift;
|
|
return map { $max-- ** 2 } (0..$max-1);
|
|
}
|
|
|
|
|
|
# stealed frm http://code.activestate.com/recipes/576564/
|
|
# and http://prxq.wordpress.com/2006/04/17/the-alias-method/
|
|
# Copyright someone maybe somewhere?
|
|
sub setup_walker_rand {
|
|
my ($weight_ref) = @_;
|
|
|
|
my @weights = @$weight_ref;
|
|
my $n = scalar @weights;
|
|
my @in_x = (-1) x $n;
|
|
my $sum_w = 0;
|
|
$sum_w += $_ for @weights;
|
|
|
|
# normalise weights to have an average value of 1.
|
|
@weights = map { $_ * $n / $sum_w } @weights;
|
|
|
|
my (@short, @long);
|
|
my $i = 0;
|
|
|
|
# split into long and short groups (excluding those which == 1)
|
|
for my $p (@weights) {
|
|
if ($p < 1) {
|
|
push @short, $i;
|
|
} elsif ($p > 1) {
|
|
push @long, $i;
|
|
}
|
|
$i++;
|
|
}
|
|
|
|
# build alias map by combining short and long elements.
|
|
while (scalar @short and scalar @long) {
|
|
my $j = pop @short;
|
|
my $k = $long[-1];
|
|
|
|
$in_x[$j] = $k;
|
|
$weights[$k] -= (1 - $weights[$j]);
|
|
|
|
if ($weights[$k] < 1) {
|
|
push @short, $k;
|
|
pop @long;
|
|
}
|
|
# printf("test: j=%d k=%d pk=%.2f\n", $j, $k, $prob[$k]);
|
|
}
|
|
return ($n, \@weights, \@in_x)
|
|
}
|
|
|
|
sub get_walker_rand {
|
|
my ($n, $prob, $in_x) = @_;
|
|
my ($u, $j);
|
|
$u = random_uniform(1,0,1);
|
|
$j = random_uniform_integer(1, 0, $n-1);
|
|
return ($u <= $prob->[$j]) ? $j : $in_x->[$j];
|
|
}
|
|
|
|
sub log {
|
|
my ($self, @msg) = @_;
|
|
if ($self->{debug}) {
|
|
print STDERR join(" ", @msg) . $/;
|
|
}
|
|
}
|
|
|
|
1;
|