moved butts2 to butts, and fixed everything. Also some more tests
This commit is contained in:
parent
bd3c7f433f
commit
83c1f1404e
240
Butts.pm
240
Butts.pm
|
@ -3,84 +3,159 @@ package Butts;
|
|||
use strict;
|
||||
use warnings;
|
||||
|
||||
use Exporter;
|
||||
use List::Util qw(max shuffle);
|
||||
use Math::Random;
|
||||
use TeX::Hyphen;
|
||||
use Carp;
|
||||
|
||||
use constant {
|
||||
DEBUG => 0,
|
||||
STOPWORDS => 'stopwords',
|
||||
HYPHEN => 'hyphen.tex',
|
||||
};
|
||||
use fields qw/replace_freq
|
||||
meme
|
||||
hyphen_file
|
||||
stopwords_file
|
||||
hyphenator
|
||||
stopwords
|
||||
debug/;
|
||||
|
||||
our @ISA = qw(Exporter);
|
||||
our @EXPORT_OK = qw(buttify);
|
||||
sub new {
|
||||
my Butts $self = shift;
|
||||
unless (ref $self) {
|
||||
$self = fields::new($self);
|
||||
}
|
||||
|
||||
my $hyp = new TeX::Hyphen( -e HYPHEN ? (file => HYPHEN) : () );
|
||||
my %args = (hyphen_file => 'hyphen.tex',
|
||||
stopwords_file => 'stopwords',
|
||||
debug => 0,
|
||||
meme => 'butt',
|
||||
replace_freq => (1/11), # original value from tef.
|
||||
@_);
|
||||
|
||||
my @stopwords;
|
||||
if (-f STOPWORDS && -r STOPWORDS) {
|
||||
open my($fh), STOPWORDS;
|
||||
chomp(@stopwords = <$fh>);
|
||||
} else {
|
||||
@stopwords = qw/a an and or but it in its It's it's the of you I i/;
|
||||
foreach my $key (keys %args) {
|
||||
$self->{$key} = $args{$key};
|
||||
}
|
||||
|
||||
$self->{hyphenator} = new TeX::Hyphen((file => $self->{hyphen_file}))
|
||||
or croak "Couldn't create TeX::Hyphen instance from " . $self->{hyphen_file};
|
||||
|
||||
my @stopwords;
|
||||
if (open my $sfh, $self->{stopwords_file}) {
|
||||
chomp(@stopwords = <$sfh>);
|
||||
close $sfh;
|
||||
} else {
|
||||
carp "Couldn't read stopwords file "
|
||||
. $self->{stopwords_file} . ' ' . $!;
|
||||
@stopwords = qw/a an and or but it in its It's it's the of you I i/;
|
||||
}
|
||||
|
||||
$self->{stopwords} = { map { lc($_) => 1 } @stopwords };
|
||||
|
||||
return $self;
|
||||
}
|
||||
|
||||
# accessor for meme
|
||||
sub meme {
|
||||
my $self = shift;
|
||||
if (@_) {
|
||||
$self->{meme} = $_[0];
|
||||
}
|
||||
return $self->{meme}
|
||||
}
|
||||
|
||||
sub is_stop_word {
|
||||
my ($self, $word) = @_;
|
||||
return exists $self->{stopwords}->{lc($word)};
|
||||
}
|
||||
|
||||
sub is_meme {
|
||||
my ($self, $word) = @_;
|
||||
return lc($word) eq lc($self->{meme});
|
||||
|
||||
}
|
||||
|
||||
sub buttify_string($_) {
|
||||
my $self = shift;
|
||||
# glom a string from $_ if we didn't get one passed.
|
||||
my $str = (@_ ? $_[0] : $_);
|
||||
chomp($str);
|
||||
my @words = split /\s+/, $str;
|
||||
return join(" ", $self->buttify(@words));
|
||||
}
|
||||
|
||||
sub buttify {
|
||||
my $meme = shift;
|
||||
my @words = @_;
|
||||
my $repetitions = int(@words / 11) + 1;
|
||||
my $c = 0;
|
||||
my ($self, @words) = @_;
|
||||
my $how_many_butts = int(@words * $self->{replace_freq}) + 1;
|
||||
my $debug = $self->{debug};
|
||||
|
||||
# sort indicies by word length
|
||||
my @longest = do {
|
||||
# sort indices by word length
|
||||
my @word_idxs_len_sorted = do {
|
||||
my $c;
|
||||
|
||||
map { $_->[0] }
|
||||
sort { $b->[1] <=> $a->[1] }
|
||||
map { [$c++ , length($_) ] } @words;
|
||||
sort { $b->[1] <=> $a->[1] }
|
||||
map { [$c++ , length($_) ] } @words;
|
||||
};
|
||||
|
||||
# remove stop words
|
||||
@longest = grep {
|
||||
my $word = $words[$_];
|
||||
@word_idxs_len_sorted = grep {
|
||||
my $word = $words[$_];
|
||||
|
||||
my $is_word = $word !~ /^[\d\W+]+$/;
|
||||
my $is_stop = grep /\Q$word\E/i, @stopwords;
|
||||
my $is_meme = $word =~ /\Q$meme\E/i;
|
||||
my $is_stop = $self->is_stop_word($word);
|
||||
my $is_meme = $self->is_meme($word);
|
||||
|
||||
$is_word and not $is_stop and not $is_meme;
|
||||
} @longest;
|
||||
} @word_idxs_len_sorted;
|
||||
|
||||
print 'Words in order: ' . join(', ', map { $words[$_] } @longest) . "\n" if DEBUG;
|
||||
# bail out if we've got nothing left. This happens
|
||||
# when a string is comprised entirely of stop-words.
|
||||
unless (@word_idxs_len_sorted) {
|
||||
$self->log("Couldn't buttify ", join(' ', @words),
|
||||
": entirely stopwords");
|
||||
return @words;
|
||||
}
|
||||
|
||||
# create weighted index array of words by length
|
||||
my @indices = map { $longest[$_] } _weighted_indices(scalar @longest);
|
||||
$self->log("Words remaining: ",
|
||||
@word_idxs_len_sorted);
|
||||
|
||||
print 'Weighted words in order: ' . join(', ', map { $words[$_] } @indices) . "\n" if DEBUG;
|
||||
$self->log('Words in length order: '
|
||||
. join(', ', map { $words[$_] } @word_idxs_len_sorted));
|
||||
|
||||
@indices = shuffle(@indices) if @indices;
|
||||
my @idx_weights = _sq_weight_indices (scalar @word_idxs_len_sorted);
|
||||
$self->log('index1 weightings: ',
|
||||
join(", ", @idx_weights));
|
||||
|
||||
for my $c (0 .. $repetitions - 1) {
|
||||
my $index = $indices[$c];
|
||||
next if (!defined $index);
|
||||
$words[$index] = _buttsub($meme, $words[$index]);
|
||||
@indices = grep { $_ != $index } @indices;
|
||||
my ($xx_n, $xx_p, $xx_x) = setup_walker_rand(\@idx_weights);
|
||||
|
||||
# keep track of which we've done already so we can pick another.
|
||||
# there's probably a better way of doing this.
|
||||
my $words_butted = {};
|
||||
$self->log("buttifying with $how_many_butts repetitions");
|
||||
for my $c (0 .. $how_many_butts-1) {
|
||||
|
||||
# Boooooooooogocheck. We really need non-replacement picks.
|
||||
my $idx_to_butt;
|
||||
do {
|
||||
my $random_idx = get_walker_rand($xx_n, $xx_p, $xx_x);
|
||||
$idx_to_butt = $word_idxs_len_sorted[$random_idx];
|
||||
} until not exists($words_butted->{$idx_to_butt});
|
||||
|
||||
$self->log("Butting word idx: $idx_to_butt [", $words[$idx_to_butt], "]");
|
||||
$words[$idx_to_butt] = $self->_buttsub($words[$idx_to_butt]);
|
||||
$words_butted->{$idx_to_butt} = 1;
|
||||
}
|
||||
|
||||
return @words;
|
||||
}
|
||||
|
||||
sub _buttsub {
|
||||
my $meme = shift;
|
||||
my $word = shift;
|
||||
my ($self, $word) = @_;
|
||||
|
||||
my $meme = $self->{meme};
|
||||
|
||||
# split off leading and trailing punctuation
|
||||
my ($lp, $actual_word, $rp) = $word =~ /^([^A-Za-z]*)(.*?)([^A-Za-z]*)$/;
|
||||
|
||||
return $word unless $actual_word;
|
||||
|
||||
my @points = (0, $hyp->hyphenate($actual_word));
|
||||
my @points = (0, $self->{hyphenator}->hyphenate($actual_word));
|
||||
|
||||
my $factor = 2;
|
||||
my $length = scalar @points;
|
||||
|
@ -90,8 +165,12 @@ sub _buttsub {
|
|||
my $l = $points[$replace];
|
||||
my $r = $points[$replace + 1] - $l;
|
||||
|
||||
while (substr($actual_word, $l + $r, 1) eq 't') { $r++ }
|
||||
while ($l > 0 && substr($actual_word, $l - 1, 1) eq 'b') { $l-- }
|
||||
while (substr($actual_word, $l + $r, 1) eq 't') {
|
||||
$r++;
|
||||
}
|
||||
while ($l > 0 && substr($actual_word, $l - 1, 1) eq 'b') {
|
||||
$l--;
|
||||
}
|
||||
my $sub = substr($actual_word, $l, $r);
|
||||
my $butt = lc($meme);
|
||||
|
||||
|
@ -99,23 +178,76 @@ sub _buttsub {
|
|||
$butt = uc($meme);
|
||||
} elsif ($sub =~/^[A-Z]/) {
|
||||
$butt = ucfirst($meme);
|
||||
}
|
||||
}
|
||||
|
||||
substr($actual_word, $l, $r) = $butt;
|
||||
return "$lp$actual_word$rp";
|
||||
}
|
||||
|
||||
sub _weighted_indices {
|
||||
my $length = shift;
|
||||
my $weight = $length;
|
||||
sub _sq_weight_indices {
|
||||
my $max = shift;
|
||||
return map { $max-- ** 2 } (0..$max-1);
|
||||
}
|
||||
|
||||
my @stack;
|
||||
for my $index (0 .. $length - 1) {
|
||||
push @stack, ($index) x ($weight ** 2);
|
||||
$weight--;
|
||||
}
|
||||
|
||||
return @stack;
|
||||
# stealed frm http://code.activestate.com/recipes/576564/
|
||||
# and http://prxq.wordpress.com/2006/04/17/the-alias-method/
|
||||
# Copyright someone maybe somewhere?
|
||||
sub setup_walker_rand {
|
||||
my ($weight_ref) = @_;
|
||||
|
||||
my @weights = @$weight_ref;
|
||||
my $n = scalar @weights;
|
||||
my @in_x = (-1) x $n;
|
||||
my $sum_w = 0;
|
||||
$sum_w += $_ for @weights;
|
||||
|
||||
# normalise weights to have an average value of 1.
|
||||
@weights = map { $_ * $n / $sum_w } @weights;
|
||||
|
||||
my (@short, @long);
|
||||
my $i = 0;
|
||||
|
||||
# split into long and short groups (excluding those which == 1)
|
||||
for my $p (@weights) {
|
||||
if ($p < 1) {
|
||||
push @short, $i;
|
||||
} elsif ($p > 1) {
|
||||
push @long, $i;
|
||||
}
|
||||
$i++;
|
||||
}
|
||||
|
||||
# build alias map by combining short and long elements.
|
||||
while (scalar @short and scalar @long) {
|
||||
my $j = pop @short;
|
||||
my $k = $long[-1];
|
||||
|
||||
$in_x[$j] = $k;
|
||||
$weights[$k] -= (1 - $weights[$j]);
|
||||
|
||||
if ($weights[$k] < 1) {
|
||||
push @short, $k;
|
||||
pop @long;
|
||||
}
|
||||
# printf("test: j=%d k=%d pk=%.2f\n", $j, $k, $prob[$k]);
|
||||
}
|
||||
return ($n, \@weights, \@in_x)
|
||||
}
|
||||
|
||||
sub get_walker_rand {
|
||||
my ($n, $prob, $in_x) = @_;
|
||||
my ($u, $j);
|
||||
$u = random_uniform(1,0,1);
|
||||
$j = random_uniform_integer(1, 0, $n-1);
|
||||
return ($u <= $prob->[$j]) ? $j : $in_x->[$j];
|
||||
}
|
||||
|
||||
sub log {
|
||||
my ($self, @msg) = @_;
|
||||
if ($self->{debug}) {
|
||||
print STDERR join(" ", @msg) . $/;
|
||||
}
|
||||
}
|
||||
|
||||
1;
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
package Butts2;
|
||||
package Butts;
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
|
@ -16,7 +16,7 @@ use fields qw/replace_freq
|
|||
debug/;
|
||||
|
||||
sub new {
|
||||
my Butts2 $self = shift;
|
||||
my Butts $self = shift;
|
||||
unless (ref $self) {
|
||||
$self = fields::new($self);
|
||||
}
|
||||
|
|
21
buttest.pl
21
buttest.pl
|
@ -2,12 +2,21 @@
|
|||
use strict;
|
||||
use warnings;
|
||||
|
||||
use Butts qw(buttify);
|
||||
use Butts;
|
||||
|
||||
my $butt = shift;
|
||||
$butt = $butt || "butt";
|
||||
my $butt = $ARGV[0] || "butt";
|
||||
my $buttifier = Butts->new(meme => $butt, debug => 1,
|
||||
replace_freq => $ARGV[1] // 0.5);
|
||||
|
||||
while(<>) {
|
||||
chomp;
|
||||
print join(" ", buttify($butt,split(/\s+/, $_)))."\n";
|
||||
print "butt repeat rate is " . $buttifier->{replace_freq} . $/;
|
||||
while(<STDIN>) {
|
||||
|
||||
# The old way
|
||||
|
||||
# chomp
|
||||
# print join(" ", $buttifier->buttify(split(/\s+/, $_)))."\n";
|
||||
|
||||
# The new way
|
||||
|
||||
print $buttifier->buttify_string, $/;
|
||||
}
|
||||
|
|
|
@ -2,10 +2,10 @@
|
|||
use strict;
|
||||
use warnings;
|
||||
|
||||
use Butts2;
|
||||
use Butts;
|
||||
|
||||
my $butt = $ARGV[0] || "butt";
|
||||
my $buttifier = Butts2->new(meme => $butt, debug => 1,
|
||||
my $buttifier = Butts->new(meme => $butt, debug => 1,
|
||||
replace_freq => $ARGV[1] || 0.5);
|
||||
|
||||
while(<STDIN>) {
|
||||
|
|
|
@ -23,9 +23,8 @@ sub on_privmsg {
|
|||
return 0 unless (rand(Irssi::settings_get_int("butt_frequency")) < 1);
|
||||
return 0 if ($text =~ /^!|^http:\/\/\S+$|butt|^\W+$/i);
|
||||
|
||||
my @words = split(/\s+/, $text);
|
||||
my @replaced_words = Butts::buttify("butt", @words);
|
||||
my $replaced_text = join(" ", @replaced_words);
|
||||
my $butter = Butts->new(meme => 'butt');
|
||||
my $replaced_text = $butter->buttify_string($text);
|
||||
|
||||
unless ($text eq $replaced_text) {
|
||||
Irssi::timeout_add_once(rand(8000) + 1000,
|
||||
|
|
37
t/Butts.t
37
t/Butts.t
|
@ -1,20 +1,41 @@
|
|||
use strict;
|
||||
use warnings;
|
||||
|
||||
use Test::More tests => 12;
|
||||
use Test::More tests => 25;
|
||||
|
||||
BEGIN { use_ok('Butts', qw(buttify)); }
|
||||
BEGIN { use_ok('Butts'); }
|
||||
|
||||
can_ok('Butts', qw(buttify));
|
||||
my $meme = "butt";
|
||||
my $butter = Butts->new(meme => $meme);
|
||||
|
||||
isa_ok($butter, 'Butts', 'butter Object');
|
||||
can_ok($butter, qw(buttify buttify_string meme));
|
||||
|
||||
is($butter->meme, $meme, 'fetching meme');
|
||||
|
||||
my $meme2 = "bacon";
|
||||
is($butter->meme($meme2), $meme2, 'changing meme');
|
||||
|
||||
# set it back
|
||||
$butter->meme($meme);
|
||||
|
||||
my @buttify_data = qw(The rain in Spain falls mainly on the plain.);
|
||||
|
||||
for (1 .. 10) {
|
||||
my @sample = buttify("butt", qw(The rain in Spain falls mainly on the plain.));
|
||||
my @output = $butter->buttify(@buttify_data);
|
||||
{
|
||||
local $" = ' ';
|
||||
print "@sample\n";
|
||||
print "@output\n";
|
||||
}
|
||||
|
||||
my $has_butt = grep { /butt/i } @sample;
|
||||
|
||||
ok($has_butt, 'sample has butt');
|
||||
my $has_butt = grep { /\Q$meme\E/i } @output;
|
||||
ok($has_butt, 'buttify array has butt');
|
||||
}
|
||||
|
||||
my $buttify_str_sample = "An idle hand is worth two in the bush\n";
|
||||
|
||||
for (1 .. 10) {
|
||||
my $output = $butter->buttify_string($buttify_str_sample);
|
||||
print $output, $/;
|
||||
like( $output, qr/\Q$meme\E/, 'buttify_string has butt');
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue