fixed distribution issues with random buttpicker algo in module, added whitespace preservation (with Test!), and probalby something else
This commit is contained in:
parent
756124c2d8
commit
ab9ba900f6
101
Butts.pm
101
Butts.pm
|
@ -5,6 +5,7 @@ use warnings;
|
|||
|
||||
use Math::Random;
|
||||
use TeX::Hyphen;
|
||||
use Data::Dumper;
|
||||
use Carp;
|
||||
|
||||
use fields qw/replace_freq
|
||||
|
@ -13,7 +14,9 @@ use fields qw/replace_freq
|
|||
stopwords_file
|
||||
hyphenator
|
||||
stopwords
|
||||
debug/;
|
||||
debug
|
||||
_words
|
||||
_word_indices/;
|
||||
|
||||
sub new {
|
||||
my Butts $self = shift;
|
||||
|
@ -79,13 +82,53 @@ sub is_meme {
|
|||
|
||||
}
|
||||
|
||||
sub split_preserving_whitespace {
|
||||
my ($string) = @_;
|
||||
|
||||
my ($leading_ws, $remainder) = ($string =~ m/^(\s*)(.*)$/);
|
||||
$leading_ws //= '';
|
||||
|
||||
my @all_split = split(/(\s+)/, $remainder);
|
||||
my (@words, @ws);
|
||||
foreach my $tok (@all_split) {
|
||||
if ($tok =~ m/^\s+$/) {
|
||||
push @ws, $tok
|
||||
} else {
|
||||
push @words, $tok
|
||||
}
|
||||
}
|
||||
return ($leading_ws, \@words, \@ws);
|
||||
}
|
||||
|
||||
|
||||
sub reassemble_with_whitespace {
|
||||
my ($leading, $words, $ws) = @_;
|
||||
|
||||
# interleave the two arrays. Words always come first, because
|
||||
# any leading space is in $leading.
|
||||
# http://www.perlmonks.org/?node_id=53605
|
||||
|
||||
# if things are different sizes we'll end up with some undefs,
|
||||
# so grep them out.
|
||||
my @ret = grep { defined } map { $words->[$_], $ws->[$_] }
|
||||
0 .. ($#$words > $#$ws ? $#$words : $#$ws);
|
||||
# and convert back to a string.
|
||||
return $leading . join('', @ret);
|
||||
}
|
||||
|
||||
sub buttify_string($_) {
|
||||
my $self = shift;
|
||||
# glom a string from $_ if we didn't get one passed.
|
||||
my $str = (@_ ? $_[0] : $_);
|
||||
chomp($str);
|
||||
my @words = split /\s+/, $str;
|
||||
return join(" ", $self->buttify(@words));
|
||||
|
||||
# FIX for http://code.google.com/p/buttbot/issues/detail?id=7
|
||||
my ($leading, $words, $whitespace)
|
||||
= split_preserving_whitespace($str);
|
||||
|
||||
my @butted_words = $self->buttify(@$words);
|
||||
|
||||
return reassemble_with_whitespace($leading, \@butted_words, $whitespace);
|
||||
}
|
||||
|
||||
sub buttify {
|
||||
|
@ -93,6 +136,7 @@ sub buttify {
|
|||
my $how_many_butts = int(@words * $self->{replace_freq}) + 1;
|
||||
my $debug = $self->{debug};
|
||||
|
||||
$self->{_words} = \@words;
|
||||
# sort indices by word length
|
||||
my @word_idxs_len_sorted = do {
|
||||
my $c;
|
||||
|
@ -113,6 +157,8 @@ sub buttify {
|
|||
$is_word and not $is_stop and not $is_meme;
|
||||
} @word_idxs_len_sorted;
|
||||
|
||||
$self->{_word_indices} = \@word_idxs_len_sorted;
|
||||
|
||||
# bail out if we've got nothing left. This happens
|
||||
# when a string is comprised entirely of stop-words.
|
||||
unless (@word_idxs_len_sorted) {
|
||||
|
@ -128,14 +174,24 @@ sub buttify {
|
|||
|
||||
$self->log("buttifying with $how_many_butts repetitions");
|
||||
my $words_butted = {};
|
||||
my @initial_weights = _sq_weight_indices(scalar @word_idxs_len_sorted);
|
||||
|
||||
my ($non_butted_indices, $xx_n, $xx_p, $xx_x)
|
||||
= $self->_build_weighted_index(\@word_idxs_len_sorted, $words_butted);
|
||||
# Selecting words to butt works in the following way:
|
||||
|
||||
# * each (non-stop) word-index is assigned a weighting based on it's
|
||||
# ordinal when sorted by (word) length. So, the longest word has weight =
|
||||
# num_words ** 2, second longest is (num_words-1)**2, ...
|
||||
|
||||
# * A random distribution selects some index proportional to its weight.
|
||||
# * The word at this index is butted.
|
||||
# * The index is removed from consideration for subsequent buttings.
|
||||
|
||||
for my $c (0 .. $how_many_butts-1) {
|
||||
my ($xx_n, $xx_p, $xx_x)
|
||||
= $self->_build_weightings_for_index(\@initial_weights, $words_butted);
|
||||
|
||||
my $random_idx = get_walker_rand($xx_n, $xx_p, $xx_x);
|
||||
my $idx_to_butt = $non_butted_indices->[$random_idx];
|
||||
my $idx_to_butt = $word_idxs_len_sorted[$random_idx];
|
||||
|
||||
$self->log("Butting word idx: $idx_to_butt [",
|
||||
$words[$idx_to_butt], "]");
|
||||
|
@ -143,10 +199,7 @@ sub buttify {
|
|||
$words[$idx_to_butt]
|
||||
= $self->_buttsub($words[$idx_to_butt]);
|
||||
|
||||
$words_butted->{$idx_to_butt} = 1;
|
||||
($non_butted_indices, $xx_n, $xx_p, $xx_x)
|
||||
= $self->_build_weighted_index(\@word_idxs_len_sorted, $words_butted);
|
||||
|
||||
$words_butted->{$random_idx} = 1;
|
||||
}
|
||||
|
||||
return @words;
|
||||
|
@ -191,21 +244,29 @@ sub _buttsub {
|
|||
return $lp . $actual_word . $rp;
|
||||
}
|
||||
|
||||
sub _build_weighted_index {
|
||||
my $self = shift;
|
||||
my (@indices) = @{ shift() },
|
||||
my $butted_indices = shift;
|
||||
sub _build_weightings_for_index {
|
||||
my ($self, $initial_weights, $butted_indices) = @_;
|
||||
|
||||
$self->log("Word indices remaining: ", @indices);
|
||||
#$self->log("Word indices remaining: ", @indices);
|
||||
|
||||
my @non_butted_indices = grep { !exists ($butted_indices->{$_}) } @indices;
|
||||
my @idx_weights = _sq_weight_indices(scalar @non_butted_indices);
|
||||
my $i = 0;
|
||||
$self->log(Dumper($butted_indices));
|
||||
$self->log(Dumper($initial_weights));
|
||||
|
||||
$self->log('index weightings: ',
|
||||
join(", ", @idx_weights));
|
||||
my @idx_weights = map {
|
||||
exists($butted_indices->{$i++})?0:$_
|
||||
} @$initial_weights;
|
||||
|
||||
my $str;
|
||||
$i = 0;
|
||||
for (@{$self->{_word_indices}}) {
|
||||
$str .= "\tIndex: $_: " . $self->{_words}->[$_]
|
||||
. " ,weight=" . $idx_weights[$i++] . "\n";
|
||||
}
|
||||
$self->log("index weightings:\n" . $str);
|
||||
|
||||
my ($n, $p, $x) = setup_walker_rand(\@idx_weights);
|
||||
return (\@non_butted_indices, $n, $p, $x)
|
||||
return ($n, $p, $x)
|
||||
}
|
||||
|
||||
sub _sq_weight_indices {
|
||||
|
|
|
@ -227,7 +227,7 @@ sub handle_said_emoted {
|
|||
|
||||
# butting is the default behaviour.
|
||||
$self->log("BUTT: Might butt\n");
|
||||
if ($self->to_butt_or_not_to_butt($who)) {
|
||||
if ($self->to_butt_or_not_to_butt($who, $body)) {
|
||||
$self->log("BUTT: Butting $who in [$channel]\n");
|
||||
$self->buttify_message($who, $channel, $body, $reply_as_emote, 0);
|
||||
}
|
||||
|
@ -413,6 +413,11 @@ sub buttify_message {
|
|||
|
||||
my $butt_msg = $self->{butter}->buttify_string($what);
|
||||
|
||||
unless ($self->_was_string_butted($what, $butt_msg)) {
|
||||
$self->log("BUTT: String \"$butt_msg\" wasn't butted");
|
||||
return 0;
|
||||
}
|
||||
|
||||
if ($reply_as_emote) {
|
||||
$self->emote(channel => $where, who => $who,
|
||||
body => $butt_msg, address => 0);
|
||||
|
@ -420,16 +425,22 @@ sub buttify_message {
|
|||
$self->say(channel => $where, who => $who,
|
||||
body => $butt_msg, address => $prefix_addressee);
|
||||
}
|
||||
1;
|
||||
return;
|
||||
}
|
||||
|
||||
sub to_butt_or_not_to_butt {
|
||||
my ($self, $sufferer) = @_;
|
||||
my ($self, $sufferer, $message) = @_;
|
||||
my $rnd_max = 0;
|
||||
my $frequencies = $self->config('frequency');
|
||||
|
||||
return 0 if $self->might_be_a_bot($sufferer);
|
||||
|
||||
# Fixes issue 6.
|
||||
unless ($self->_is_string_buttable($message)) {
|
||||
$self->log("BUTT: String is not buttable");
|
||||
return 0;
|
||||
}
|
||||
|
||||
if ($self->is_enemy($sufferer)) {
|
||||
$self->log("BUTT: [$sufferer:enemy] not butting\n");
|
||||
return 0;
|
||||
|
@ -444,6 +455,26 @@ sub to_butt_or_not_to_butt {
|
|||
return ($rnd==0);
|
||||
}
|
||||
|
||||
# FIX for
|
||||
# http://code.google.com/p/buttbot/issues/detail?id=6
|
||||
# Message must contain at least some word characters that we can butt.
|
||||
sub _is_string_buttable {
|
||||
my ($self, $str) = @_;
|
||||
return $str =~ m/[a-zA-Z]+/;
|
||||
}
|
||||
|
||||
# test if a string is the same as it was pre- and post-butting.
|
||||
# returns true if strings are different
|
||||
sub _was_string_butted {
|
||||
my ($self, $in, $out) = @_;
|
||||
my $meme = $self->config('meme');
|
||||
|
||||
# we can't trust whitespace, since we might have trimmed it differently.
|
||||
$in =~ s/\s+//g;
|
||||
$out =~ s/\s+//g;
|
||||
return (lc($in) ne lc($out)) && ($out =~ m/\Q$meme\E/i);
|
||||
}
|
||||
|
||||
sub might_be_a_bot {
|
||||
my ($self, $who) = @_;
|
||||
return ($who =~ m/cout|(?:bot$)/i);
|
||||
|
|
|
@ -4,7 +4,7 @@ connection:
|
|||
nick: butty
|
||||
ident: null
|
||||
gecos: heh
|
||||
channels: ['#m00p']
|
||||
channels: ['#m33p']
|
||||
settings:
|
||||
cmd_prefix: '!'
|
||||
meme: butt
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
use strict;
|
||||
use warnings;
|
||||
|
||||
use Test::More tests => 13;
|
||||
|
||||
BEGIN { use_ok('Butts'); }
|
||||
|
||||
foreach my $test_str (<DATA>) {
|
||||
chomp($test_str);
|
||||
my ($l, $words, $ws) = Butts::split_preserving_whitespace($test_str);
|
||||
my $recovered_str = Butts::reassemble_with_whitespace($l, $words, $ws);
|
||||
cmp_ok($recovered_str, 'eq', $test_str);
|
||||
}
|
||||
|
||||
__DATA__
|
||||
normal spaced string
|
||||
leading space
|
||||
trailing space
|
||||
some extra inner space
|
||||
leading and weird-sp-ace
|
||||
leading and trailing
|
||||
leading and weird in ner
|
||||
lots of space
|
||||
|
||||
!!!
|
||||
! !! @%@£
|
||||
!! ; $$$ 55 5 fff
|
Loading…
Reference in New Issue