moved butts2 to butts, and fixed everything. Also some more tests

This commit is contained in:
shabble 2009-10-28 16:02:56 +00:00
parent bd3c7f433f
commit 83c1f1404e
6 changed files with 236 additions and 75 deletions

240
Butts.pm
View File

@ -3,84 +3,159 @@ package Butts;
use strict;
use warnings;
use Exporter;
use List::Util qw(max shuffle);
use Math::Random;
use TeX::Hyphen;
use Carp;
use constant {
DEBUG => 0,
STOPWORDS => 'stopwords',
HYPHEN => 'hyphen.tex',
};
use fields qw/replace_freq
meme
hyphen_file
stopwords_file
hyphenator
stopwords
debug/;
our @ISA = qw(Exporter);
our @EXPORT_OK = qw(buttify);
sub new {
my Butts $self = shift;
unless (ref $self) {
$self = fields::new($self);
}
my $hyp = new TeX::Hyphen( -e HYPHEN ? (file => HYPHEN) : () );
my %args = (hyphen_file => 'hyphen.tex',
stopwords_file => 'stopwords',
debug => 0,
meme => 'butt',
replace_freq => (1/11), # original value from tef.
@_);
my @stopwords;
if (-f STOPWORDS && -r STOPWORDS) {
open my($fh), STOPWORDS;
chomp(@stopwords = <$fh>);
} else {
@stopwords = qw/a an and or but it in its It's it's the of you I i/;
foreach my $key (keys %args) {
$self->{$key} = $args{$key};
}
$self->{hyphenator} = new TeX::Hyphen((file => $self->{hyphen_file}))
or croak "Couldn't create TeX::Hyphen instance from " . $self->{hyphen_file};
my @stopwords;
if (open my $sfh, $self->{stopwords_file}) {
chomp(@stopwords = <$sfh>);
close $sfh;
} else {
carp "Couldn't read stopwords file "
. $self->{stopwords_file} . ' ' . $!;
@stopwords = qw/a an and or but it in its It's it's the of you I i/;
}
$self->{stopwords} = { map { lc($_) => 1 } @stopwords };
return $self;
}
# accessor for meme
sub meme {
my $self = shift;
if (@_) {
$self->{meme} = $_[0];
}
return $self->{meme}
}
sub is_stop_word {
my ($self, $word) = @_;
return exists $self->{stopwords}->{lc($word)};
}
sub is_meme {
my ($self, $word) = @_;
return lc($word) eq lc($self->{meme});
}
sub buttify_string($_) {
my $self = shift;
# glom a string from $_ if we didn't get one passed.
my $str = (@_ ? $_[0] : $_);
chomp($str);
my @words = split /\s+/, $str;
return join(" ", $self->buttify(@words));
}
sub buttify {
my $meme = shift;
my @words = @_;
my $repetitions = int(@words / 11) + 1;
my $c = 0;
my ($self, @words) = @_;
my $how_many_butts = int(@words * $self->{replace_freq}) + 1;
my $debug = $self->{debug};
# sort indicies by word length
my @longest = do {
# sort indices by word length
my @word_idxs_len_sorted = do {
my $c;
map { $_->[0] }
sort { $b->[1] <=> $a->[1] }
map { [$c++ , length($_) ] } @words;
sort { $b->[1] <=> $a->[1] }
map { [$c++ , length($_) ] } @words;
};
# remove stop words
@longest = grep {
my $word = $words[$_];
@word_idxs_len_sorted = grep {
my $word = $words[$_];
my $is_word = $word !~ /^[\d\W+]+$/;
my $is_stop = grep /\Q$word\E/i, @stopwords;
my $is_meme = $word =~ /\Q$meme\E/i;
my $is_stop = $self->is_stop_word($word);
my $is_meme = $self->is_meme($word);
$is_word and not $is_stop and not $is_meme;
} @longest;
} @word_idxs_len_sorted;
print 'Words in order: ' . join(', ', map { $words[$_] } @longest) . "\n" if DEBUG;
# bail out if we've got nothing left. This happens
# when a string is comprised entirely of stop-words.
unless (@word_idxs_len_sorted) {
$self->log("Couldn't buttify ", join(' ', @words),
": entirely stopwords");
return @words;
}
# create weighted index array of words by length
my @indices = map { $longest[$_] } _weighted_indices(scalar @longest);
$self->log("Words remaining: ",
@word_idxs_len_sorted);
print 'Weighted words in order: ' . join(', ', map { $words[$_] } @indices) . "\n" if DEBUG;
$self->log('Words in length order: '
. join(', ', map { $words[$_] } @word_idxs_len_sorted));
@indices = shuffle(@indices) if @indices;
my @idx_weights = _sq_weight_indices (scalar @word_idxs_len_sorted);
$self->log('index1 weightings: ',
join(", ", @idx_weights));
for my $c (0 .. $repetitions - 1) {
my $index = $indices[$c];
next if (!defined $index);
$words[$index] = _buttsub($meme, $words[$index]);
@indices = grep { $_ != $index } @indices;
my ($xx_n, $xx_p, $xx_x) = setup_walker_rand(\@idx_weights);
# keep track of which we've done already so we can pick another.
# there's probably a better way of doing this.
my $words_butted = {};
$self->log("buttifying with $how_many_butts repetitions");
for my $c (0 .. $how_many_butts-1) {
# Boooooooooogocheck. We really need non-replacement picks.
my $idx_to_butt;
do {
my $random_idx = get_walker_rand($xx_n, $xx_p, $xx_x);
$idx_to_butt = $word_idxs_len_sorted[$random_idx];
} until not exists($words_butted->{$idx_to_butt});
$self->log("Butting word idx: $idx_to_butt [", $words[$idx_to_butt], "]");
$words[$idx_to_butt] = $self->_buttsub($words[$idx_to_butt]);
$words_butted->{$idx_to_butt} = 1;
}
return @words;
}
sub _buttsub {
my $meme = shift;
my $word = shift;
my ($self, $word) = @_;
my $meme = $self->{meme};
# split off leading and trailing punctuation
my ($lp, $actual_word, $rp) = $word =~ /^([^A-Za-z]*)(.*?)([^A-Za-z]*)$/;
return $word unless $actual_word;
my @points = (0, $hyp->hyphenate($actual_word));
my @points = (0, $self->{hyphenator}->hyphenate($actual_word));
my $factor = 2;
my $length = scalar @points;
@ -90,8 +165,12 @@ sub _buttsub {
my $l = $points[$replace];
my $r = $points[$replace + 1] - $l;
while (substr($actual_word, $l + $r, 1) eq 't') { $r++ }
while ($l > 0 && substr($actual_word, $l - 1, 1) eq 'b') { $l-- }
while (substr($actual_word, $l + $r, 1) eq 't') {
$r++;
}
while ($l > 0 && substr($actual_word, $l - 1, 1) eq 'b') {
$l--;
}
my $sub = substr($actual_word, $l, $r);
my $butt = lc($meme);
@ -99,23 +178,76 @@ sub _buttsub {
$butt = uc($meme);
} elsif ($sub =~/^[A-Z]/) {
$butt = ucfirst($meme);
}
}
substr($actual_word, $l, $r) = $butt;
return "$lp$actual_word$rp";
}
sub _weighted_indices {
my $length = shift;
my $weight = $length;
sub _sq_weight_indices {
my $max = shift;
return map { $max-- ** 2 } (0..$max-1);
}
my @stack;
for my $index (0 .. $length - 1) {
push @stack, ($index) x ($weight ** 2);
$weight--;
}
return @stack;
# stealed frm http://code.activestate.com/recipes/576564/
# and http://prxq.wordpress.com/2006/04/17/the-alias-method/
# Copyright someone maybe somewhere?
sub setup_walker_rand {
my ($weight_ref) = @_;
my @weights = @$weight_ref;
my $n = scalar @weights;
my @in_x = (-1) x $n;
my $sum_w = 0;
$sum_w += $_ for @weights;
# normalise weights to have an average value of 1.
@weights = map { $_ * $n / $sum_w } @weights;
my (@short, @long);
my $i = 0;
# split into long and short groups (excluding those which == 1)
for my $p (@weights) {
if ($p < 1) {
push @short, $i;
} elsif ($p > 1) {
push @long, $i;
}
$i++;
}
# build alias map by combining short and long elements.
while (scalar @short and scalar @long) {
my $j = pop @short;
my $k = $long[-1];
$in_x[$j] = $k;
$weights[$k] -= (1 - $weights[$j]);
if ($weights[$k] < 1) {
push @short, $k;
pop @long;
}
# printf("test: j=%d k=%d pk=%.2f\n", $j, $k, $prob[$k]);
}
return ($n, \@weights, \@in_x)
}
sub get_walker_rand {
my ($n, $prob, $in_x) = @_;
my ($u, $j);
$u = random_uniform(1,0,1);
$j = random_uniform_integer(1, 0, $n-1);
return ($u <= $prob->[$j]) ? $j : $in_x->[$j];
}
sub log {
my ($self, @msg) = @_;
if ($self->{debug}) {
print STDERR join(" ", @msg) . $/;
}
}
1;

View File

@ -1,4 +1,4 @@
package Butts2;
package Butts;
use strict;
use warnings;
@ -16,7 +16,7 @@ use fields qw/replace_freq
debug/;
sub new {
my Butts2 $self = shift;
my Butts $self = shift;
unless (ref $self) {
$self = fields::new($self);
}

View File

@ -2,12 +2,21 @@
use strict;
use warnings;
use Butts qw(buttify);
use Butts;
my $butt = shift;
$butt = $butt || "butt";
my $butt = $ARGV[0] || "butt";
my $buttifier = Butts->new(meme => $butt, debug => 1,
replace_freq => $ARGV[1] // 0.5);
while(<>) {
chomp;
print join(" ", buttify($butt,split(/\s+/, $_)))."\n";
print "butt repeat rate is " . $buttifier->{replace_freq} . $/;
while(<STDIN>) {
# The old way
# chomp
# print join(" ", $buttifier->buttify(split(/\s+/, $_)))."\n";
# The new way
print $buttifier->buttify_string, $/;
}

View File

@ -2,10 +2,10 @@
use strict;
use warnings;
use Butts2;
use Butts;
my $butt = $ARGV[0] || "butt";
my $buttifier = Butts2->new(meme => $butt, debug => 1,
my $buttifier = Butts->new(meme => $butt, debug => 1,
replace_freq => $ARGV[1] || 0.5);
while(<STDIN>) {

View File

@ -23,9 +23,8 @@ sub on_privmsg {
return 0 unless (rand(Irssi::settings_get_int("butt_frequency")) < 1);
return 0 if ($text =~ /^!|^http:\/\/\S+$|butt|^\W+$/i);
my @words = split(/\s+/, $text);
my @replaced_words = Butts::buttify("butt", @words);
my $replaced_text = join(" ", @replaced_words);
my $butter = Butts->new(meme => 'butt');
my $replaced_text = $butter->buttify_string($text);
unless ($text eq $replaced_text) {
Irssi::timeout_add_once(rand(8000) + 1000,

View File

@ -1,20 +1,41 @@
use strict;
use warnings;
use Test::More tests => 12;
use Test::More tests => 25;
BEGIN { use_ok('Butts', qw(buttify)); }
BEGIN { use_ok('Butts'); }
can_ok('Butts', qw(buttify));
my $meme = "butt";
my $butter = Butts->new(meme => $meme);
isa_ok($butter, 'Butts', 'butter Object');
can_ok($butter, qw(buttify buttify_string meme));
is($butter->meme, $meme, 'fetching meme');
my $meme2 = "bacon";
is($butter->meme($meme2), $meme2, 'changing meme');
# set it back
$butter->meme($meme);
my @buttify_data = qw(The rain in Spain falls mainly on the plain.);
for (1 .. 10) {
my @sample = buttify("butt", qw(The rain in Spain falls mainly on the plain.));
my @output = $butter->buttify(@buttify_data);
{
local $" = ' ';
print "@sample\n";
print "@output\n";
}
my $has_butt = grep { /butt/i } @sample;
ok($has_butt, 'sample has butt');
my $has_butt = grep { /\Q$meme\E/i } @output;
ok($has_butt, 'buttify array has butt');
}
my $buttify_str_sample = "An idle hand is worth two in the bush\n";
for (1 .. 10) {
my $output = $butter->buttify_string($buttify_str_sample);
print $output, $/;
like( $output, qr/\Q$meme\E/, 'buttify_string has butt');
}