=head1 NAME Butts - replace random syllables with the arbitrary memes. =head1 SYNOPSIS # with all defaults my $butter = Butts->new; $butter->buttify_string("hello there"); # with all known options my $butter = Butts->new( meme => 'butt', replace_freq => (1/11), debug => 0, hyphen_file => 'hyphens.tex', stopwords_file => 'stopwords', ); $butter->buttify(@tokens); $butter->buttify_string($string); =head1 DESCRIPTION Yes. =cut =head1 CONSTRUCTOR =over =item C Takes a number of optional arguments: 'debug', sets module debugging output on or off. 'hyphen_file', specify a different hyphen file for L, defaults to C<$module_dir/hyphen.tex>. 'stopwords_file', specify a different source of stopwords, defaults to C<$module_dir/stopwords>. =back =cut use strict; use warnings; { package Butts; use Moose; use Math::Random; use TeX::Hyphen; use Data::Dumper; use Dir::Self; use Carp; has 'replace_freq' => ( isa => 'Num', is => 'rw', default => sub { 1/11 } ); has 'meme' => ( isa => 'Str', is => 'rw', default => sub { 'butt' }, required => 1, ); has 'hyphen_file' => ( isa => 'Str', is => 'ro', default => sub { __DIR__ . '/hyphen.tex' }, ); has 'stopwords_file' => ( isa => 'Str', is => 'ro', default => sub { __DIR__ . '/stopwords' }, ); has 'debug' => ( isa => 'Bool', is => 'rw', required => 1, default => sub { 0 }, ); has 'hyphenator' => ( isa => 'TeX::Hyphen', is => 'ro', lazy => 1, builder => '_build_hyphenator', ); has 'stopwords' => ( isa => 'HashRef[Str]', is => 'ro', lazy => 1, builder => '_build_stopwords', ); has 'words' => ( isa => 'ArrayRef[Str]', is => 'ro', writer => '_set_words', default => sub { [] }, ); has 'word_indices' => ( isa => 'ArrayRef[Int]', is => 'ro', default => sub { [] }, writer => '_set_word_indices', ); =head1 METHODS =cut sub _build_hyphenator { my $self = shift; return TeX::Hyphen->new(file => $self->hyphen_file); } sub _build_stopwords { my $self = shift; my @stopwords; if (open my $sfh, $self->stopwords_file) { chomp(@stopwords = <$sfh>); close $sfh; } else { carp "Couldn't read stopwords file " . $self->stopwords_file . ' ' . $!; @stopwords = qw/a an and or but it in its It's it's the of you I i/; } return { map { lc($_) => 1 } @stopwords }; } =head2 meme($value) Method which sets / returns the current replacement meme. If called without additional arguments, it returns the current meme. Calling it with a scalar replaces the old meme with a new one. =head2 replace_freq($value) Getter/Setter Method for the replacement frequency. Value should be passed as a fractional value, which corresponds to the number of words considered for meme replacement via the following calculation: =head2 debug($value) Turn debugging output on (C<1>) or off (C<0>). Debugging output is printed to C. =cut # helpers sub is_stop_word { my ($self, $word) = @_; return exists $self->stopwords->{lc($word)}; } sub is_url { my ($self, $word) = @_; return $word =~ /^https?:\/\//i; } sub is_meme { my ($self, $word) = @_; return lc($word) eq lc($self->meme); } sub _split_preserving_whitespace { my ($self, $string) = @_; my ($leading_ws, $remainder) = ($string =~ m/^(\s*)(.*)$/s); $leading_ws = defined $leading_ws ? $leading_ws : ''; my @all_split = split(/(\s+)/, $remainder); my (@words, @ws); foreach my $tok (@all_split) { if ($tok =~ m/^\s+$/) { push @ws, $tok } else { push @words, $tok } } return ($leading_ws, \@words, \@ws); } sub _reassemble_with_whitespace { my ($self, $leading, $words, $ws) = @_; # interleave the two arrays. Words always come first, because # any leading space is in $leading. # http://www.perlmonks.org/?node_id=53605 # if things are different sizes we'll end up with some undefs, # so grep them out. my @ret = grep { defined } map { $words->[$_], $ws->[$_] } 0 .. ($#$words > $#$ws ? $#$words : $#$ws); # and convert back to a string. return $leading . join('', @ret); } =head2 buttify_string This method is the core of Butts.pm. It takes a string argument (or defaults to C<$_> if none is given, and returns a string in which random parts of words have been replaced with the contents of C<$self-Ememe>. The original whitespace of the string is preserved as far as possible. =cut sub buttify_string($_) { my $self = shift; # glom a string from $_ if we didn't get one passed. my $str = (@_ ? $_[0] : $_); chomp($str); # FIX for http://code.google.com/p/buttbot/issues/detail?id=7 my ($leading, $words, $whitespace) = $self->_split_preserving_whitespace($str); my @butted_words = $self->buttify(@$words); return $self->_reassemble_with_whitespace($leading, \@butted_words, $whitespace); } =head2 buttify(@words) Operates in a similar fashion to L, but should be passed a pre-tokenised array of words. It returns an array of equal length in which some portion of (some of) the tokens have been replaced by the meme in C<$self-Ememe>. =cut sub buttify { my ($self, @words) = @_; my $how_many_butts = int(@words * $self->replace_freq) + 1; my $debug = $self->debug; $self->_set_words(\@words); # sort indices by word length my @word_idxs_len_sorted = do { my $c; map { $_->[0] } sort { $b->[1] <=> $a->[1] } map { [$c++ , length($_) ] } @words; }; # remove stop words @word_idxs_len_sorted = grep { my $word = $words[$_]; my $is_word = $word !~ /^[\d\W+]+$/; my $is_stop = $self->is_stop_word($word); my $is_meme = $self->is_meme($word); my $is_url = $self->is_url($word); $is_word and not $is_stop and not $is_meme and not $is_url; } @word_idxs_len_sorted; $self->_set_word_indices(\@word_idxs_len_sorted); # bail out if we've got nothing left. This happens # when a string is comprised entirely of stop-words. unless (@word_idxs_len_sorted) { $self->log("Couldn't buttify ", join(' ', @words), ": entirely stopwords"); return @words; } # make sure we're not trying to butt too hard. if ($how_many_butts > @word_idxs_len_sorted) { $how_many_butts = scalar(@word_idxs_len_sorted); } $self->log("buttifying with $how_many_butts repetitions"); my $words_butted = {}; my @initial_weights = _sq_weight_indices(scalar @word_idxs_len_sorted); # Selecting words to butt works in the following way: # * each (non-stop) word-index is assigned a weighting based on it's # ordinal when sorted by (word) length. So, the longest word has weight = # num_words ** 2, second longest is (num_words-1)**2, ... # * A random distribution selects some index proportional to its weight. # * The word at this index is butted. # * The index is removed from consideration for subsequent buttings. for my $c (0 .. $how_many_butts-1) { my ($xx_n, $xx_p, $xx_x) = $self->_build_weightings_for_index(\@initial_weights, $words_butted); my $random_idx = get_walker_rand($xx_n, $xx_p, $xx_x); my $idx_to_butt = $word_idxs_len_sorted[$random_idx]; $self->log("Butting word idx: $idx_to_butt [", $words[$idx_to_butt], "]"); $words[$idx_to_butt] = $self->_buttsub($words[$idx_to_butt]); $words_butted->{$random_idx} = 1; } return @words; } sub _find_repeating_vowel { my ($self, $word) = @_; my $vowels = "aeiouAEIOU"; my $j = 0; my $j_record = 0; my $k_record = 0; while ($j < length($word)) { if (index($vowels, substr($word,$j,1)) > -1) { # $word[$j] is a vowel; how many times does it repeat? my $k = 0; do { ++$k; } while (($j + $k < length($word)) && (substr($word,$j+$k,1) eq substr($word,$j,1))); # save the vowel that repeats most if ($k > $k_record) { $j_record = $j; $k_record = $k; } } ++$j; } return ($j_record, $k_record); } sub _buttsub { my ($self, $word) = @_; my $meme = $self->meme; # split off leading and trailing punctuation my ($lp, $actual_word, $rp) = $word =~ /^([^A-Za-z]*)(.*?)([^A-Za-z]*)$/; return $word unless $actual_word; my @points = (0, $self->hyphenator->hyphenate($actual_word)); my $factor = 2; my $length = scalar @points; my $replace = $length - 1 - int(rand($length ** $factor) ** (1 / $factor)); push @points, length($actual_word); my $l = $points[$replace]; my $r = $points[$replace + 1] - $l; while (substr($actual_word, $l + $r, 1) eq 't') { $r++; } while ($l > 0 && substr($actual_word, $l - 1, 1) eq 'b') { $l--; } my $sub = substr($actual_word, $l, $r); my $butt = lc($meme); if ($sub eq uc $sub) { $butt = uc($meme); } elsif ($sub =~/^[A-Z]/) { $butt = ucfirst($meme); } my ($j, $k) = $self->_find_repeating_vowel($sub); if ($k > 2) { my $k2; ($j, $k2) = $self->_find_repeating_vowel($butt); substr($butt, $j, 1) = substr($butt, $j, 1) x $k; } substr($actual_word, $l, $r) = $butt; return $lp . $actual_word . $rp; } sub _build_weightings_for_index { my ($self, $initial_weights, $butted_indices) = @_; #$self->log("Word indices remaining: ", @indices); my $i = 0; if ($self->debug) { $self->log(Dumper($butted_indices)); $self->log(Dumper($initial_weights)); } my @idx_weights = map { exists($butted_indices->{$i++})?0:$_ } @$initial_weights; my $str; $i = 0; for (@{$self->word_indices}) { $str .= "\tIndex: $_: " . $self->words->[$_] . " ,weight=" . $idx_weights[$i++] . "\n"; } $self->log("index weightings:\n" . $str); my ($n, $p, $x) = setup_walker_rand(\@idx_weights); return ($n, $p, $x) } sub _sq_weight_indices { my $max = shift; return map { $max-- ** 2 } (0..$max-1); } # stealed from http://code.activestate.com/recipes/576564/ # and http://prxq.wordpress.com/2006/04/17/the-alias-method/ # Copyright someone maybe somewhere? sub setup_walker_rand { my ($weight_ref) = @_; my @weights = @$weight_ref; my $n = scalar @weights; my @in_x = (-1) x $n; my $sum_w = 0; $sum_w += $_ for @weights; # normalise weights to have an average value of 1. @weights = map { $_ * $n / $sum_w } @weights; my (@short, @long); my $i = 0; # split into long and short groups (excluding those which == 1) for my $p (@weights) { if ($p < 1) { push @short, $i; } elsif ($p > 1) { push @long, $i; } $i++; } # build alias map by combining short and long elements. while (scalar @short and scalar @long) { my $j = pop @short; my $k = $long[-1]; $in_x[$j] = $k; $weights[$k] -= (1 - $weights[$j]); if ($weights[$k] < 1) { push @short, $k; pop @long; } # printf("test: j=%d k=%d pk=%.2f\n", $j, $k, $prob[$k]); } return ($n, \@weights, \@in_x) } sub get_walker_rand { my ($n, $prob, $in_x) = @_; my ($u, $j); $u = random_uniform(1,0,1); $j = random_uniform_integer(1, 0, $n-1); return ($u <= $prob->[$j]) ? $j : $in_x->[$j]; } sub log { my ($self, @msg) = @_; if ($self->debug) { print STDERR join(" ", @msg) . $/; } } no Moose; __PACKAGE__->meta->make_immutable; }