fixed distribution issues with random buttpicker algo in module, added whitespace preservation (with Test!), and probalby something else

This commit is contained in:
shabble 2009-10-28 21:57:59 +00:00
parent 756124c2d8
commit ab9ba900f6
4 changed files with 146 additions and 27 deletions

107
Butts.pm
View File

@ -5,6 +5,7 @@ use warnings;
use Math::Random; use Math::Random;
use TeX::Hyphen; use TeX::Hyphen;
use Data::Dumper;
use Carp; use Carp;
use fields qw/replace_freq use fields qw/replace_freq
@ -13,7 +14,9 @@ use fields qw/replace_freq
stopwords_file stopwords_file
hyphenator hyphenator
stopwords stopwords
debug/; debug
_words
_word_indices/;
sub new { sub new {
my Butts $self = shift; my Butts $self = shift;
@ -79,13 +82,53 @@ sub is_meme {
} }
sub split_preserving_whitespace {
my ($string) = @_;
my ($leading_ws, $remainder) = ($string =~ m/^(\s*)(.*)$/);
$leading_ws //= '';
my @all_split = split(/(\s+)/, $remainder);
my (@words, @ws);
foreach my $tok (@all_split) {
if ($tok =~ m/^\s+$/) {
push @ws, $tok
} else {
push @words, $tok
}
}
return ($leading_ws, \@words, \@ws);
}
sub reassemble_with_whitespace {
my ($leading, $words, $ws) = @_;
# interleave the two arrays. Words always come first, because
# any leading space is in $leading.
# http://www.perlmonks.org/?node_id=53605
# if things are different sizes we'll end up with some undefs,
# so grep them out.
my @ret = grep { defined } map { $words->[$_], $ws->[$_] }
0 .. ($#$words > $#$ws ? $#$words : $#$ws);
# and convert back to a string.
return $leading . join('', @ret);
}
sub buttify_string($_) { sub buttify_string($_) {
my $self = shift; my $self = shift;
# glom a string from $_ if we didn't get one passed. # glom a string from $_ if we didn't get one passed.
my $str = (@_ ? $_[0] : $_); my $str = (@_ ? $_[0] : $_);
chomp($str); chomp($str);
my @words = split /\s+/, $str;
return join(" ", $self->buttify(@words)); # FIX for http://code.google.com/p/buttbot/issues/detail?id=7
my ($leading, $words, $whitespace)
= split_preserving_whitespace($str);
my @butted_words = $self->buttify(@$words);
return reassemble_with_whitespace($leading, \@butted_words, $whitespace);
} }
sub buttify { sub buttify {
@ -93,6 +136,7 @@ sub buttify {
my $how_many_butts = int(@words * $self->{replace_freq}) + 1; my $how_many_butts = int(@words * $self->{replace_freq}) + 1;
my $debug = $self->{debug}; my $debug = $self->{debug};
$self->{_words} = \@words;
# sort indices by word length # sort indices by word length
my @word_idxs_len_sorted = do { my @word_idxs_len_sorted = do {
my $c; my $c;
@ -113,6 +157,8 @@ sub buttify {
$is_word and not $is_stop and not $is_meme; $is_word and not $is_stop and not $is_meme;
} @word_idxs_len_sorted; } @word_idxs_len_sorted;
$self->{_word_indices} = \@word_idxs_len_sorted;
# bail out if we've got nothing left. This happens # bail out if we've got nothing left. This happens
# when a string is comprised entirely of stop-words. # when a string is comprised entirely of stop-words.
unless (@word_idxs_len_sorted) { unless (@word_idxs_len_sorted) {
@ -128,14 +174,24 @@ sub buttify {
$self->log("buttifying with $how_many_butts repetitions"); $self->log("buttifying with $how_many_butts repetitions");
my $words_butted = {}; my $words_butted = {};
my @initial_weights = _sq_weight_indices(scalar @word_idxs_len_sorted);
my ($non_butted_indices, $xx_n, $xx_p, $xx_x) # Selecting words to butt works in the following way:
= $self->_build_weighted_index(\@word_idxs_len_sorted, $words_butted);
# * each (non-stop) word-index is assigned a weighting based on it's
# ordinal when sorted by (word) length. So, the longest word has weight =
# num_words ** 2, second longest is (num_words-1)**2, ...
# * A random distribution selects some index proportional to its weight.
# * The word at this index is butted.
# * The index is removed from consideration for subsequent buttings.
for my $c (0 .. $how_many_butts-1) { for my $c (0 .. $how_many_butts-1) {
my ($xx_n, $xx_p, $xx_x)
= $self->_build_weightings_for_index(\@initial_weights, $words_butted);
my $random_idx = get_walker_rand($xx_n, $xx_p, $xx_x); my $random_idx = get_walker_rand($xx_n, $xx_p, $xx_x);
my $idx_to_butt = $non_butted_indices->[$random_idx]; my $idx_to_butt = $word_idxs_len_sorted[$random_idx];
$self->log("Butting word idx: $idx_to_butt [", $self->log("Butting word idx: $idx_to_butt [",
$words[$idx_to_butt], "]"); $words[$idx_to_butt], "]");
@ -143,10 +199,7 @@ sub buttify {
$words[$idx_to_butt] $words[$idx_to_butt]
= $self->_buttsub($words[$idx_to_butt]); = $self->_buttsub($words[$idx_to_butt]);
$words_butted->{$idx_to_butt} = 1; $words_butted->{$random_idx} = 1;
($non_butted_indices, $xx_n, $xx_p, $xx_x)
= $self->_build_weighted_index(\@word_idxs_len_sorted, $words_butted);
} }
return @words; return @words;
@ -161,7 +214,7 @@ sub _buttsub {
my ($lp, $actual_word, $rp) = $word =~ /^([^A-Za-z]*)(.*?)([^A-Za-z]*)$/; my ($lp, $actual_word, $rp) = $word =~ /^([^A-Za-z]*)(.*?)([^A-Za-z]*)$/;
return $word unless $actual_word; return $word unless $actual_word;
my @points = (0, $self->{hyphenator}->hyphenate($actual_word)); my @points = (0, $self->{hyphenator}->hyphenate($actual_word));
my $factor = 2; my $factor = 2;
@ -171,7 +224,7 @@ sub _buttsub {
my $l = $points[$replace]; my $l = $points[$replace];
my $r = $points[$replace + 1] - $l; my $r = $points[$replace + 1] - $l;
while (substr($actual_word, $l + $r, 1) eq 't') { while (substr($actual_word, $l + $r, 1) eq 't') {
$r++; $r++;
} }
@ -186,26 +239,34 @@ sub _buttsub {
} elsif ($sub =~/^[A-Z]/) { } elsif ($sub =~/^[A-Z]/) {
$butt = ucfirst($meme); $butt = ucfirst($meme);
} }
substr($actual_word, $l, $r) = $butt; substr($actual_word, $l, $r) = $butt;
return $lp . $actual_word . $rp; return $lp . $actual_word . $rp;
} }
sub _build_weighted_index { sub _build_weightings_for_index {
my $self = shift; my ($self, $initial_weights, $butted_indices) = @_;
my (@indices) = @{ shift() },
my $butted_indices = shift;
$self->log("Word indices remaining: ", @indices); #$self->log("Word indices remaining: ", @indices);
my @non_butted_indices = grep { !exists ($butted_indices->{$_}) } @indices; my $i = 0;
my @idx_weights = _sq_weight_indices(scalar @non_butted_indices); $self->log(Dumper($butted_indices));
$self->log(Dumper($initial_weights));
$self->log('index weightings: ', my @idx_weights = map {
join(", ", @idx_weights)); exists($butted_indices->{$i++})?0:$_
} @$initial_weights;
my $str;
$i = 0;
for (@{$self->{_word_indices}}) {
$str .= "\tIndex: $_: " . $self->{_words}->[$_]
. " ,weight=" . $idx_weights[$i++] . "\n";
}
$self->log("index weightings:\n" . $str);
my ($n, $p, $x) = setup_walker_rand(\@idx_weights); my ($n, $p, $x) = setup_walker_rand(\@idx_weights);
return (\@non_butted_indices, $n, $p, $x) return ($n, $p, $x)
} }
sub _sq_weight_indices { sub _sq_weight_indices {

View File

@ -227,7 +227,7 @@ sub handle_said_emoted {
# butting is the default behaviour. # butting is the default behaviour.
$self->log("BUTT: Might butt\n"); $self->log("BUTT: Might butt\n");
if ($self->to_butt_or_not_to_butt($who)) { if ($self->to_butt_or_not_to_butt($who, $body)) {
$self->log("BUTT: Butting $who in [$channel]\n"); $self->log("BUTT: Butting $who in [$channel]\n");
$self->buttify_message($who, $channel, $body, $reply_as_emote, 0); $self->buttify_message($who, $channel, $body, $reply_as_emote, 0);
} }
@ -413,6 +413,11 @@ sub buttify_message {
my $butt_msg = $self->{butter}->buttify_string($what); my $butt_msg = $self->{butter}->buttify_string($what);
unless ($self->_was_string_butted($what, $butt_msg)) {
$self->log("BUTT: String \"$butt_msg\" wasn't butted");
return 0;
}
if ($reply_as_emote) { if ($reply_as_emote) {
$self->emote(channel => $where, who => $who, $self->emote(channel => $where, who => $who,
body => $butt_msg, address => 0); body => $butt_msg, address => 0);
@ -420,16 +425,22 @@ sub buttify_message {
$self->say(channel => $where, who => $who, $self->say(channel => $where, who => $who,
body => $butt_msg, address => $prefix_addressee); body => $butt_msg, address => $prefix_addressee);
} }
1; return;
} }
sub to_butt_or_not_to_butt { sub to_butt_or_not_to_butt {
my ($self, $sufferer) = @_; my ($self, $sufferer, $message) = @_;
my $rnd_max = 0; my $rnd_max = 0;
my $frequencies = $self->config('frequency'); my $frequencies = $self->config('frequency');
return 0 if $self->might_be_a_bot($sufferer); return 0 if $self->might_be_a_bot($sufferer);
# Fixes issue 6.
unless ($self->_is_string_buttable($message)) {
$self->log("BUTT: String is not buttable");
return 0;
}
if ($self->is_enemy($sufferer)) { if ($self->is_enemy($sufferer)) {
$self->log("BUTT: [$sufferer:enemy] not butting\n"); $self->log("BUTT: [$sufferer:enemy] not butting\n");
return 0; return 0;
@ -444,6 +455,26 @@ sub to_butt_or_not_to_butt {
return ($rnd==0); return ($rnd==0);
} }
# FIX for
# http://code.google.com/p/buttbot/issues/detail?id=6
# Message must contain at least some word characters that we can butt.
sub _is_string_buttable {
my ($self, $str) = @_;
return $str =~ m/[a-zA-Z]+/;
}
# test if a string is the same as it was pre- and post-butting.
# returns true if strings are different
sub _was_string_butted {
my ($self, $in, $out) = @_;
my $meme = $self->config('meme');
# we can't trust whitespace, since we might have trimmed it differently.
$in =~ s/\s+//g;
$out =~ s/\s+//g;
return (lc($in) ne lc($out)) && ($out =~ m/\Q$meme\E/i);
}
sub might_be_a_bot { sub might_be_a_bot {
my ($self, $who) = @_; my ($self, $who) = @_;
return ($who =~ m/cout|(?:bot$)/i); return ($who =~ m/cout|(?:bot$)/i);

View File

@ -4,7 +4,7 @@ connection:
nick: butty nick: butty
ident: null ident: null
gecos: heh gecos: heh
channels: ['#m00p'] channels: ['#m33p']
settings: settings:
cmd_prefix: '!' cmd_prefix: '!'
meme: butt meme: butt

27
t/Whitespace.t Normal file
View File

@ -0,0 +1,27 @@
use strict;
use warnings;
use Test::More tests => 13;
BEGIN { use_ok('Butts'); }
foreach my $test_str (<DATA>) {
chomp($test_str);
my ($l, $words, $ws) = Butts::split_preserving_whitespace($test_str);
my $recovered_str = Butts::reassemble_with_whitespace($l, $words, $ws);
cmp_ok($recovered_str, 'eq', $test_str);
}
__DATA__
normal spaced string
leading space
trailing space
some extra inner space
leading and weird-sp-ace
leading and trailing
leading and weird in ner
lots of space
!!!
! !! @%@£
!! ; $$$ 55 5 fff