fixed distribution issues with random buttpicker algo in module, added whitespace preservation (with Test!), and probalby something else

2009-10-28 21:57:59 +00:00 · 2009-10-28 21:57:59 +00:00 · ab9ba900f6
parent 756124c2d8
commit ab9ba900f6
4 changed files with 146 additions and 27 deletions
--- a/Butts.pm
+++ b/Butts.pm
@ -5,6 +5,7 @@ use warnings;
 use Math::Random;
 use TeX::Hyphen;
 use Data::Dumper;
 use Carp;
 use fields qw/replace_freq
@ -13,7 +14,9 @@ use fields qw/replace_freq
              stopwords_file
              hyphenator
              stopwords
-              debug/;
+              debug
              _words
              _word_indices/;
 sub new {
    my Butts $self = shift;
@ -79,13 +82,53 @@ sub is_meme {
 }
 sub split_preserving_whitespace {
    my ($string) = @_;
    my ($leading_ws, $remainder) = ($string =~ m/^(\s*)(.*)$/);
    $leading_ws //= '';
    my @all_split = split(/(\s+)/, $remainder);
    my (@words, @ws);
    foreach my $tok (@all_split) {
        if ($tok =~ m/^\s+$/) {
            push @ws, $tok
        } else {
            push @words, $tok
        }
    }
    return ($leading_ws, \@words, \@ws);
 }
 sub reassemble_with_whitespace {
    my ($leading, $words, $ws) = @_;
    # interleave the two arrays. Words always come first, because
    # any leading space is in $leading.
    # http://www.perlmonks.org/?node_id=53605
    # if things are different sizes we'll end up with some undefs,
    # so grep them out.
    my @ret = grep { defined } map { $words->[$_], $ws->[$_] }
      0 .. ($#$words > $#$ws ? $#$words : $#$ws);
    # and convert back to a string.
    return $leading . join('', @ret);
 }
 sub buttify_string($_) {
    my $self = shift;
    # glom a string from $_ if we didn't get one passed.
    my $str = (@_ ? $_[0] : $_);
    chomp($str);
-    my @words = split /\s+/, $str;
+
-    return join(" ", $self->buttify(@words));
+    # FIX for http://code.google.com/p/buttbot/issues/detail?id=7
    my ($leading, $words, $whitespace)
      = split_preserving_whitespace($str);
    my @butted_words = $self->buttify(@$words);
    return reassemble_with_whitespace($leading, \@butted_words, $whitespace);
 }
 sub buttify {
@ -93,6 +136,7 @@ sub buttify {
 	my $how_many_butts = int(@words * $self->{replace_freq}) + 1;
    my $debug = $self->{debug};
    $self->{_words} = \@words;
 	# sort indices by word length
 	my @word_idxs_len_sorted = do {
 		my $c;
@ -113,6 +157,8 @@ sub buttify {
 		$is_word and not $is_stop and not $is_meme;
 	} @word_idxs_len_sorted;
    $self->{_word_indices} = \@word_idxs_len_sorted;
    # bail out if we've got nothing left. This happens
    # when a string is comprised entirely of stop-words.
    unless (@word_idxs_len_sorted) {
@ -128,14 +174,24 @@ sub buttify {
    $self->log("buttifying with $how_many_butts repetitions");
    my $words_butted = {};
    my @initial_weights = _sq_weight_indices(scalar @word_idxs_len_sorted);
-    my ($non_butted_indices, $xx_n, $xx_p, $xx_x) 
+    # Selecting words to butt works in the following way:
-      = $self->_build_weighted_index(\@word_idxs_len_sorted, $words_butted);
+
    #  * each (non-stop) word-index is assigned a weighting based on it's
    #    ordinal when sorted by (word) length. So, the longest word has weight =
    #    num_words ** 2, second longest is (num_words-1)**2, ...
    #  * A random distribution selects some index proportional to its weight.
    #  * The word at this index is butted.
    #  * The index is removed from consideration for subsequent buttings.
 	for my $c (0 .. $how_many_butts-1) {
        my ($xx_n, $xx_p, $xx_x)
          = $self->_build_weightings_for_index(\@initial_weights, $words_butted);
        my $random_idx  = get_walker_rand($xx_n, $xx_p, $xx_x);
-        my $idx_to_butt = $non_butted_indices->[$random_idx];
+        my $idx_to_butt = $word_idxs_len_sorted[$random_idx];
        $self->log("Butting word idx: $idx_to_butt [",
                   $words[$idx_to_butt], "]");
@ -143,10 +199,7 @@ sub buttify {
 		$words[$idx_to_butt]
          = $self->_buttsub($words[$idx_to_butt]);
-        $words_butted->{$idx_to_butt} = 1;
+        $words_butted->{$random_idx} = 1;
        ($non_butted_indices, $xx_n, $xx_p, $xx_x) 
          = $self->_build_weighted_index(\@word_idxs_len_sorted, $words_butted);
 	}
 	return @words;
@ -161,7 +214,7 @@ sub _buttsub {
 	my ($lp, $actual_word, $rp) = $word =~ /^([^A-Za-z]*)(.*?)([^A-Za-z]*)$/;
 	return $word unless $actual_word;
-	
+
 	my @points = (0, $self->{hyphenator}->hyphenate($actual_word));
 	my $factor = 2;
@ -171,7 +224,7 @@ sub _buttsub {
 	my $l = $points[$replace];
 	my $r = $points[$replace + 1] - $l;
-	
+
 	while (substr($actual_word, $l + $r, 1) eq 't') {
        $r++;
    }
@ -186,26 +239,34 @@ sub _buttsub {
 	} elsif ($sub =~/^[A-Z]/) {
 		$butt = ucfirst($meme);
 	}
-	
+
 	substr($actual_word, $l, $r) = $butt;
 	return $lp . $actual_word . $rp;
 }
-sub _build_weighted_index {
+sub _build_weightings_for_index {
-    my $self = shift;
+    my ($self, $initial_weights, $butted_indices) = @_;
    my (@indices) = @{ shift() },
    my $butted_indices = shift;
-    $self->log("Word indices remaining: ", @indices);
+    #$self->log("Word indices remaining: ", @indices);
-    my @non_butted_indices = grep { !exists ($butted_indices->{$_}) } @indices;
+    my $i = 0;
-    my @idx_weights = _sq_weight_indices(scalar @non_butted_indices);
+    $self->log(Dumper($butted_indices));
    $self->log(Dumper($initial_weights));
-    $self->log('index weightings: ',
+    my @idx_weights = map {
-               join(", ", @idx_weights));
+        exists($butted_indices->{$i++})?0:$_
    } @$initial_weights;
    my $str;
    $i = 0;
    for (@{$self->{_word_indices}}) {
        $str .= "\tIndex: $_: " . $self->{_words}->[$_]
          . " ,weight=" . $idx_weights[$i++] . "\n";
    }
    $self->log("index weightings:\n" . $str);
    my ($n, $p, $x) = setup_walker_rand(\@idx_weights);
-    return (\@non_butted_indices, $n, $p, $x)
+    return ($n, $p, $x)
 }
 sub _sq_weight_indices {
--- a/contrib/basicbuttbot.pl
+++ b/contrib/basicbuttbot.pl
@ -227,7 +227,7 @@ sub handle_said_emoted {
    # butting is the default behaviour.
    $self->log("BUTT: Might butt\n");
-    if ($self->to_butt_or_not_to_butt($who)) {
+    if ($self->to_butt_or_not_to_butt($who, $body)) {
        $self->log("BUTT: Butting $who in [$channel]\n");
        $self->buttify_message($who, $channel, $body, $reply_as_emote, 0);
    }
@ -413,6 +413,11 @@ sub buttify_message {
    my $butt_msg = $self->{butter}->buttify_string($what);
    unless ($self->_was_string_butted($what, $butt_msg)) {
        $self->log("BUTT: String \"$butt_msg\" wasn't butted");
        return 0;
    }
    if ($reply_as_emote) {
        $self->emote(channel => $where, who => $who,
                   body => $butt_msg, address => 0);
@ -420,16 +425,22 @@ sub buttify_message {
        $self->say(channel => $where, who => $who,
                   body => $butt_msg, address => $prefix_addressee);
    }
-    1;
+    return;
 }
 sub to_butt_or_not_to_butt {
-    my ($self, $sufferer) = @_;
+    my ($self, $sufferer, $message) = @_;
    my $rnd_max = 0;
    my $frequencies = $self->config('frequency');
    return 0 if $self->might_be_a_bot($sufferer);
    # Fixes issue 6.
    unless ($self->_is_string_buttable($message)) {
        $self->log("BUTT: String is not buttable");
        return 0;
    }
    if ($self->is_enemy($sufferer)) {
        $self->log("BUTT: [$sufferer:enemy] not butting\n");
        return 0;
@ -444,6 +455,26 @@ sub to_butt_or_not_to_butt {
    return ($rnd==0);
 }
 # FIX for
 # http://code.google.com/p/buttbot/issues/detail?id=6
 # Message must contain at least some word characters that we can butt.
 sub _is_string_buttable {
    my ($self, $str) = @_;
    return $str =~ m/[a-zA-Z]+/;
 }
 # test if a string is the same as it was pre- and post-butting.
 # returns true if strings are different
 sub _was_string_butted {
    my ($self, $in, $out) = @_;
    my $meme = $self->config('meme');
    # we can't trust whitespace, since we might have trimmed it differently.
    $in =~ s/\s+//g;
    $out =~ s/\s+//g;
    return (lc($in) ne lc($out)) && ($out =~ m/\Q$meme\E/i);
 }
 sub might_be_a_bot {
    my ($self, $who) = @_;
    return ($who =~ m/cout|(?:bot$)/i);
--- a/contrib/conf.yml
+++ b/contrib/conf.yml
@ -4,7 +4,7 @@ connection:
    nick: butty
    ident: null
    gecos: heh
-    channels: ['#m00p']
+    channels: ['#m33p']
 settings:
    cmd_prefix: '!'
    meme:       butt
--- a/t/Whitespace.t
+++ b/t/Whitespace.t
@ -0,0 +1,27 @@
 use strict;
 use warnings;
 use Test::More tests => 13;
 BEGIN { use_ok('Butts'); }
 foreach my $test_str (<DATA>) {
    chomp($test_str);
    my ($l, $words, $ws) = Butts::split_preserving_whitespace($test_str);
    my $recovered_str = Butts::reassemble_with_whitespace($l, $words, $ws);
    cmp_ok($recovered_str, 'eq', $test_str);
 }
 __DATA__
 normal spaced string
 leading space
 trailing space 
 some  extra   inner space
 leading  and   weird-sp-ace
 leading and trailing 
 leading and  weird in       ner
   lots     of    space    
 !!!  
   ! !! @%@£
 !! ; $$$ 55    5	fff