fixed distribution issues with random buttpicker algo in module, added whitespace preservation (with Test!), and probalby something else
This commit is contained in:
parent
756124c2d8
commit
ab9ba900f6
107
Butts.pm
107
Butts.pm
|
@ -5,6 +5,7 @@ use warnings;
|
||||||
|
|
||||||
use Math::Random;
|
use Math::Random;
|
||||||
use TeX::Hyphen;
|
use TeX::Hyphen;
|
||||||
|
use Data::Dumper;
|
||||||
use Carp;
|
use Carp;
|
||||||
|
|
||||||
use fields qw/replace_freq
|
use fields qw/replace_freq
|
||||||
|
@ -13,7 +14,9 @@ use fields qw/replace_freq
|
||||||
stopwords_file
|
stopwords_file
|
||||||
hyphenator
|
hyphenator
|
||||||
stopwords
|
stopwords
|
||||||
debug/;
|
debug
|
||||||
|
_words
|
||||||
|
_word_indices/;
|
||||||
|
|
||||||
sub new {
|
sub new {
|
||||||
my Butts $self = shift;
|
my Butts $self = shift;
|
||||||
|
@ -79,13 +82,53 @@ sub is_meme {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sub split_preserving_whitespace {
|
||||||
|
my ($string) = @_;
|
||||||
|
|
||||||
|
my ($leading_ws, $remainder) = ($string =~ m/^(\s*)(.*)$/);
|
||||||
|
$leading_ws //= '';
|
||||||
|
|
||||||
|
my @all_split = split(/(\s+)/, $remainder);
|
||||||
|
my (@words, @ws);
|
||||||
|
foreach my $tok (@all_split) {
|
||||||
|
if ($tok =~ m/^\s+$/) {
|
||||||
|
push @ws, $tok
|
||||||
|
} else {
|
||||||
|
push @words, $tok
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ($leading_ws, \@words, \@ws);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
sub reassemble_with_whitespace {
|
||||||
|
my ($leading, $words, $ws) = @_;
|
||||||
|
|
||||||
|
# interleave the two arrays. Words always come first, because
|
||||||
|
# any leading space is in $leading.
|
||||||
|
# http://www.perlmonks.org/?node_id=53605
|
||||||
|
|
||||||
|
# if things are different sizes we'll end up with some undefs,
|
||||||
|
# so grep them out.
|
||||||
|
my @ret = grep { defined } map { $words->[$_], $ws->[$_] }
|
||||||
|
0 .. ($#$words > $#$ws ? $#$words : $#$ws);
|
||||||
|
# and convert back to a string.
|
||||||
|
return $leading . join('', @ret);
|
||||||
|
}
|
||||||
|
|
||||||
sub buttify_string($_) {
|
sub buttify_string($_) {
|
||||||
my $self = shift;
|
my $self = shift;
|
||||||
# glom a string from $_ if we didn't get one passed.
|
# glom a string from $_ if we didn't get one passed.
|
||||||
my $str = (@_ ? $_[0] : $_);
|
my $str = (@_ ? $_[0] : $_);
|
||||||
chomp($str);
|
chomp($str);
|
||||||
my @words = split /\s+/, $str;
|
|
||||||
return join(" ", $self->buttify(@words));
|
# FIX for http://code.google.com/p/buttbot/issues/detail?id=7
|
||||||
|
my ($leading, $words, $whitespace)
|
||||||
|
= split_preserving_whitespace($str);
|
||||||
|
|
||||||
|
my @butted_words = $self->buttify(@$words);
|
||||||
|
|
||||||
|
return reassemble_with_whitespace($leading, \@butted_words, $whitespace);
|
||||||
}
|
}
|
||||||
|
|
||||||
sub buttify {
|
sub buttify {
|
||||||
|
@ -93,6 +136,7 @@ sub buttify {
|
||||||
my $how_many_butts = int(@words * $self->{replace_freq}) + 1;
|
my $how_many_butts = int(@words * $self->{replace_freq}) + 1;
|
||||||
my $debug = $self->{debug};
|
my $debug = $self->{debug};
|
||||||
|
|
||||||
|
$self->{_words} = \@words;
|
||||||
# sort indices by word length
|
# sort indices by word length
|
||||||
my @word_idxs_len_sorted = do {
|
my @word_idxs_len_sorted = do {
|
||||||
my $c;
|
my $c;
|
||||||
|
@ -113,6 +157,8 @@ sub buttify {
|
||||||
$is_word and not $is_stop and not $is_meme;
|
$is_word and not $is_stop and not $is_meme;
|
||||||
} @word_idxs_len_sorted;
|
} @word_idxs_len_sorted;
|
||||||
|
|
||||||
|
$self->{_word_indices} = \@word_idxs_len_sorted;
|
||||||
|
|
||||||
# bail out if we've got nothing left. This happens
|
# bail out if we've got nothing left. This happens
|
||||||
# when a string is comprised entirely of stop-words.
|
# when a string is comprised entirely of stop-words.
|
||||||
unless (@word_idxs_len_sorted) {
|
unless (@word_idxs_len_sorted) {
|
||||||
|
@ -128,14 +174,24 @@ sub buttify {
|
||||||
|
|
||||||
$self->log("buttifying with $how_many_butts repetitions");
|
$self->log("buttifying with $how_many_butts repetitions");
|
||||||
my $words_butted = {};
|
my $words_butted = {};
|
||||||
|
my @initial_weights = _sq_weight_indices(scalar @word_idxs_len_sorted);
|
||||||
|
|
||||||
my ($non_butted_indices, $xx_n, $xx_p, $xx_x)
|
# Selecting words to butt works in the following way:
|
||||||
= $self->_build_weighted_index(\@word_idxs_len_sorted, $words_butted);
|
|
||||||
|
# * each (non-stop) word-index is assigned a weighting based on it's
|
||||||
|
# ordinal when sorted by (word) length. So, the longest word has weight =
|
||||||
|
# num_words ** 2, second longest is (num_words-1)**2, ...
|
||||||
|
|
||||||
|
# * A random distribution selects some index proportional to its weight.
|
||||||
|
# * The word at this index is butted.
|
||||||
|
# * The index is removed from consideration for subsequent buttings.
|
||||||
|
|
||||||
for my $c (0 .. $how_many_butts-1) {
|
for my $c (0 .. $how_many_butts-1) {
|
||||||
|
my ($xx_n, $xx_p, $xx_x)
|
||||||
|
= $self->_build_weightings_for_index(\@initial_weights, $words_butted);
|
||||||
|
|
||||||
my $random_idx = get_walker_rand($xx_n, $xx_p, $xx_x);
|
my $random_idx = get_walker_rand($xx_n, $xx_p, $xx_x);
|
||||||
my $idx_to_butt = $non_butted_indices->[$random_idx];
|
my $idx_to_butt = $word_idxs_len_sorted[$random_idx];
|
||||||
|
|
||||||
$self->log("Butting word idx: $idx_to_butt [",
|
$self->log("Butting word idx: $idx_to_butt [",
|
||||||
$words[$idx_to_butt], "]");
|
$words[$idx_to_butt], "]");
|
||||||
|
@ -143,10 +199,7 @@ sub buttify {
|
||||||
$words[$idx_to_butt]
|
$words[$idx_to_butt]
|
||||||
= $self->_buttsub($words[$idx_to_butt]);
|
= $self->_buttsub($words[$idx_to_butt]);
|
||||||
|
|
||||||
$words_butted->{$idx_to_butt} = 1;
|
$words_butted->{$random_idx} = 1;
|
||||||
($non_butted_indices, $xx_n, $xx_p, $xx_x)
|
|
||||||
= $self->_build_weighted_index(\@word_idxs_len_sorted, $words_butted);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return @words;
|
return @words;
|
||||||
|
@ -161,7 +214,7 @@ sub _buttsub {
|
||||||
my ($lp, $actual_word, $rp) = $word =~ /^([^A-Za-z]*)(.*?)([^A-Za-z]*)$/;
|
my ($lp, $actual_word, $rp) = $word =~ /^([^A-Za-z]*)(.*?)([^A-Za-z]*)$/;
|
||||||
|
|
||||||
return $word unless $actual_word;
|
return $word unless $actual_word;
|
||||||
|
|
||||||
my @points = (0, $self->{hyphenator}->hyphenate($actual_word));
|
my @points = (0, $self->{hyphenator}->hyphenate($actual_word));
|
||||||
|
|
||||||
my $factor = 2;
|
my $factor = 2;
|
||||||
|
@ -171,7 +224,7 @@ sub _buttsub {
|
||||||
|
|
||||||
my $l = $points[$replace];
|
my $l = $points[$replace];
|
||||||
my $r = $points[$replace + 1] - $l;
|
my $r = $points[$replace + 1] - $l;
|
||||||
|
|
||||||
while (substr($actual_word, $l + $r, 1) eq 't') {
|
while (substr($actual_word, $l + $r, 1) eq 't') {
|
||||||
$r++;
|
$r++;
|
||||||
}
|
}
|
||||||
|
@ -186,26 +239,34 @@ sub _buttsub {
|
||||||
} elsif ($sub =~/^[A-Z]/) {
|
} elsif ($sub =~/^[A-Z]/) {
|
||||||
$butt = ucfirst($meme);
|
$butt = ucfirst($meme);
|
||||||
}
|
}
|
||||||
|
|
||||||
substr($actual_word, $l, $r) = $butt;
|
substr($actual_word, $l, $r) = $butt;
|
||||||
return $lp . $actual_word . $rp;
|
return $lp . $actual_word . $rp;
|
||||||
}
|
}
|
||||||
|
|
||||||
sub _build_weighted_index {
|
sub _build_weightings_for_index {
|
||||||
my $self = shift;
|
my ($self, $initial_weights, $butted_indices) = @_;
|
||||||
my (@indices) = @{ shift() },
|
|
||||||
my $butted_indices = shift;
|
|
||||||
|
|
||||||
$self->log("Word indices remaining: ", @indices);
|
#$self->log("Word indices remaining: ", @indices);
|
||||||
|
|
||||||
my @non_butted_indices = grep { !exists ($butted_indices->{$_}) } @indices;
|
my $i = 0;
|
||||||
my @idx_weights = _sq_weight_indices(scalar @non_butted_indices);
|
$self->log(Dumper($butted_indices));
|
||||||
|
$self->log(Dumper($initial_weights));
|
||||||
|
|
||||||
$self->log('index weightings: ',
|
my @idx_weights = map {
|
||||||
join(", ", @idx_weights));
|
exists($butted_indices->{$i++})?0:$_
|
||||||
|
} @$initial_weights;
|
||||||
|
|
||||||
|
my $str;
|
||||||
|
$i = 0;
|
||||||
|
for (@{$self->{_word_indices}}) {
|
||||||
|
$str .= "\tIndex: $_: " . $self->{_words}->[$_]
|
||||||
|
. " ,weight=" . $idx_weights[$i++] . "\n";
|
||||||
|
}
|
||||||
|
$self->log("index weightings:\n" . $str);
|
||||||
|
|
||||||
my ($n, $p, $x) = setup_walker_rand(\@idx_weights);
|
my ($n, $p, $x) = setup_walker_rand(\@idx_weights);
|
||||||
return (\@non_butted_indices, $n, $p, $x)
|
return ($n, $p, $x)
|
||||||
}
|
}
|
||||||
|
|
||||||
sub _sq_weight_indices {
|
sub _sq_weight_indices {
|
||||||
|
|
|
@ -227,7 +227,7 @@ sub handle_said_emoted {
|
||||||
|
|
||||||
# butting is the default behaviour.
|
# butting is the default behaviour.
|
||||||
$self->log("BUTT: Might butt\n");
|
$self->log("BUTT: Might butt\n");
|
||||||
if ($self->to_butt_or_not_to_butt($who)) {
|
if ($self->to_butt_or_not_to_butt($who, $body)) {
|
||||||
$self->log("BUTT: Butting $who in [$channel]\n");
|
$self->log("BUTT: Butting $who in [$channel]\n");
|
||||||
$self->buttify_message($who, $channel, $body, $reply_as_emote, 0);
|
$self->buttify_message($who, $channel, $body, $reply_as_emote, 0);
|
||||||
}
|
}
|
||||||
|
@ -413,6 +413,11 @@ sub buttify_message {
|
||||||
|
|
||||||
my $butt_msg = $self->{butter}->buttify_string($what);
|
my $butt_msg = $self->{butter}->buttify_string($what);
|
||||||
|
|
||||||
|
unless ($self->_was_string_butted($what, $butt_msg)) {
|
||||||
|
$self->log("BUTT: String \"$butt_msg\" wasn't butted");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
if ($reply_as_emote) {
|
if ($reply_as_emote) {
|
||||||
$self->emote(channel => $where, who => $who,
|
$self->emote(channel => $where, who => $who,
|
||||||
body => $butt_msg, address => 0);
|
body => $butt_msg, address => 0);
|
||||||
|
@ -420,16 +425,22 @@ sub buttify_message {
|
||||||
$self->say(channel => $where, who => $who,
|
$self->say(channel => $where, who => $who,
|
||||||
body => $butt_msg, address => $prefix_addressee);
|
body => $butt_msg, address => $prefix_addressee);
|
||||||
}
|
}
|
||||||
1;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
sub to_butt_or_not_to_butt {
|
sub to_butt_or_not_to_butt {
|
||||||
my ($self, $sufferer) = @_;
|
my ($self, $sufferer, $message) = @_;
|
||||||
my $rnd_max = 0;
|
my $rnd_max = 0;
|
||||||
my $frequencies = $self->config('frequency');
|
my $frequencies = $self->config('frequency');
|
||||||
|
|
||||||
return 0 if $self->might_be_a_bot($sufferer);
|
return 0 if $self->might_be_a_bot($sufferer);
|
||||||
|
|
||||||
|
# Fixes issue 6.
|
||||||
|
unless ($self->_is_string_buttable($message)) {
|
||||||
|
$self->log("BUTT: String is not buttable");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
if ($self->is_enemy($sufferer)) {
|
if ($self->is_enemy($sufferer)) {
|
||||||
$self->log("BUTT: [$sufferer:enemy] not butting\n");
|
$self->log("BUTT: [$sufferer:enemy] not butting\n");
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -444,6 +455,26 @@ sub to_butt_or_not_to_butt {
|
||||||
return ($rnd==0);
|
return ($rnd==0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# FIX for
|
||||||
|
# http://code.google.com/p/buttbot/issues/detail?id=6
|
||||||
|
# Message must contain at least some word characters that we can butt.
|
||||||
|
sub _is_string_buttable {
|
||||||
|
my ($self, $str) = @_;
|
||||||
|
return $str =~ m/[a-zA-Z]+/;
|
||||||
|
}
|
||||||
|
|
||||||
|
# test if a string is the same as it was pre- and post-butting.
|
||||||
|
# returns true if strings are different
|
||||||
|
sub _was_string_butted {
|
||||||
|
my ($self, $in, $out) = @_;
|
||||||
|
my $meme = $self->config('meme');
|
||||||
|
|
||||||
|
# we can't trust whitespace, since we might have trimmed it differently.
|
||||||
|
$in =~ s/\s+//g;
|
||||||
|
$out =~ s/\s+//g;
|
||||||
|
return (lc($in) ne lc($out)) && ($out =~ m/\Q$meme\E/i);
|
||||||
|
}
|
||||||
|
|
||||||
sub might_be_a_bot {
|
sub might_be_a_bot {
|
||||||
my ($self, $who) = @_;
|
my ($self, $who) = @_;
|
||||||
return ($who =~ m/cout|(?:bot$)/i);
|
return ($who =~ m/cout|(?:bot$)/i);
|
||||||
|
|
|
@ -4,7 +4,7 @@ connection:
|
||||||
nick: butty
|
nick: butty
|
||||||
ident: null
|
ident: null
|
||||||
gecos: heh
|
gecos: heh
|
||||||
channels: ['#m00p']
|
channels: ['#m33p']
|
||||||
settings:
|
settings:
|
||||||
cmd_prefix: '!'
|
cmd_prefix: '!'
|
||||||
meme: butt
|
meme: butt
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
use strict;
|
||||||
|
use warnings;
|
||||||
|
|
||||||
|
use Test::More tests => 13;
|
||||||
|
|
||||||
|
BEGIN { use_ok('Butts'); }
|
||||||
|
|
||||||
|
foreach my $test_str (<DATA>) {
|
||||||
|
chomp($test_str);
|
||||||
|
my ($l, $words, $ws) = Butts::split_preserving_whitespace($test_str);
|
||||||
|
my $recovered_str = Butts::reassemble_with_whitespace($l, $words, $ws);
|
||||||
|
cmp_ok($recovered_str, 'eq', $test_str);
|
||||||
|
}
|
||||||
|
|
||||||
|
__DATA__
|
||||||
|
normal spaced string
|
||||||
|
leading space
|
||||||
|
trailing space
|
||||||
|
some extra inner space
|
||||||
|
leading and weird-sp-ace
|
||||||
|
leading and trailing
|
||||||
|
leading and weird in ner
|
||||||
|
lots of space
|
||||||
|
|
||||||
|
!!!
|
||||||
|
! !! @%@£
|
||||||
|
!! ; $$$ 55 5 fff
|
Loading…
Reference in New Issue