Text-Affixes-0.07/0000755000076500007650000000000010337675634013313 5ustar jacjac00000000000000Text-Affixes-0.07/Affixes.pm0000644000076500007650000001205510337675523015236 0ustar jacjac00000000000000package Text::Affixes; use 5.006; use strict; use warnings; require Exporter; use AutoLoader qw(AUTOLOAD); our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw( ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our @EXPORT = qw( get_prefixes get_suffixes ); our $VERSION = '0.07'; =head1 NAME Text::Affixes - Prefixes and suffixes analisys of text =head1 SYNOPSIS use Text::Affixes; my $text = "Hello, world. Hello, big world."; my $prefixes = get_prefixes($text); # $prefixes now holds # { # 3 => { # 'Hel' => 2, # 'wor' => 2, # } # } # or $prefixes = get_prefixes({min => 1, max => 2},$text); # $prefixes now holds # { # 1 => { # 'H' => 2, # 'w' => 2, # 'b' => 1, # }, # 2 => { # 'He' => 2, # 'wo' => 2, # 'bi' => 1, # } # } # the use for get_suffixes is similar =head1 DESCRIPTION Provides methods for prefixe and suffix analisys of text. =head1 METHODS =head2 get_prefixes Extracts prefixes from text. You can specify the minimum and maximum number of characters of prefixes you want. Returns a reference to a hash, where the specified limits are mapped in hashes; each of those hashes maps every prefix in the text into the number of times it was found. By default, both minimum and maximum limits are 3. If the minimum limit is greater than the lower one, an empty hash is returned. A prefix is considered to be a sequence of word characters (\w) in the beginning of a word (that is, after a word boundary) that does not reach the end of the word ("regular expressionly", a prefix is the $1 of /\b(\w+)\w/). # extracting prefixes of size 3 $prefixes = get_prefixes( $text ); # extracting prefixes of sizes 2 and 3 $prefixes = get_prefixes( {min => 2}, $text ); # extracting prefixes of sizes 3 and 4 $prefixes = get_prefixes( {max => 4}, $text ); # extracting prefixes of sizes 2, 3 and 4 $prefixes = get_prefixes( {min => 2, max=> 4}, $text); =cut sub get_prefixes { return _get_elements(1,@_); } =head2 get_suffixes The get_suffixes function is similar to the get_prefixes one. You should read the documentation for that one and than come back to this point. A suffix is considered to be a sequence of word characters (\w) in the end of a word (that is, before a word boundary) that does not start at the beginning of the word ("regular expressionly" speaking, a prefix is the $1 of /\w(\w+)\b/). # extracting suffixes of size 3 $suffixes = get_suffixes( $text ); # extracting suffixes of sizes 2 and 3 $suffixes = get_suffixes( {min => 2}, $text ); # extracting suffixes of sizes 3 and 4 $suffixes = get_suffixes( {max => 4}, $text ); # extracting suffixes of sizes 2, 3 and 4 $suffixes = get_suffixes( {min => 2, max=> 4}, $text); =cut sub get_suffixes { return _get_elements(0,@_); } sub _get_elements { my $task = shift; =head1 OPTIONS Apart from deciding on a minimum and maximum size for prefixes or suffixes, you can also decide on some configuration options. =cut # configuration my %conf = ( min => 3, max => 3, exclude_numbers => 1, lowercase => 0, ); if (ref $_[0] eq 'HASH') { %conf = (%conf, %{+shift}); } return {} if $conf{max} < $conf{min}; # get the elements my %elements; my $text = shift || return undef; $conf{min} = 1 if $conf{min} < 1; for ($conf{min} .. $conf{max}) { my $regex = $task ? qr/\b(\w{$_})\w/ : # prefixes qr/\w(\w{$_})\b/ ; # suffixes while ($text =~ /$regex/g) { $elements{$_}{$1}++; } } =head2 exclude_numbers Set to 0 if you consider numbers as part of words. Default value is 1. # this get_suffixes( {min => 1, max => 1, exclude_numbers => 0}, "Hello, but w8" ); # returns this: { 1 => { 'o' => 1, 't' => 1, '8' => 1 } } =cut # exclude elements containing numbers if ($conf{exclude_numbers}) { for my $s (keys %elements) { for (keys %{$elements{$s}}) { delete ${$elements{$s}}{$_} if /\d/; } } } =head2 lowercase Set to 1 to extract all prefixes in lowercase mode. Default value is 0. ATTENTION: This does not mean that prefixes with uppercased characters won't be extracted. It means they will be extracted after being lowercased. # this... get_prefixes( {min => 2, max => 2, lowercase => 1}, "Hello, hello"); # returns this: { 2 => { 'he' => 2 } } =cut # elements containing uppercased characters become lowercased ones if ($conf{lowercase}) { for my $s (keys %elements) { for (keys %{$elements{$s}}) { if (/[A-Z]/) { ${$elements{$s}}{lc $_} += ${$elements{$s}}{$_}; delete ${$elements{$s}}{$_}; } } } } return \%elements; } 1; __END__ =head1 TO DO =over 6 =item * Make it more efficient (use C for that) =back =head1 AUTHOR Jose Castro, C<< >> =head1 COPYRIGHT & LICENSE Copyright 2004 Jose Castro, All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut Text-Affixes-0.07/Changes0000644000076500007650000000163110337675565014612 0ustar jacjac00000000000000Revision history for Text-Affixes 0.07 Sat Nov 19 19:22:00 2005 - increased test coverage 0.06 Fri Nov 05 00:17:14 2004 - added parameter "exclude_numbers" to the configuration - added parameter "lowercase" to the configuration - code for extracting prefixes and suffixes now resides in the same private function 0.05 Mon Nov 01 15:18:00 2004 - Perl required version is now 5.006 - changes in the documentation - added pod.t and pod-coverage.t to the tests directory - when minimum size is bigger than maximum size, a reference to an empty hash is automatically returned 0.04 Tue Oct 26 19:20:00 2004 - corrected grammar (it's "suffix", not "sufix"; duh!) 0.03 Tue Oct 26 00:00:10 2004 - minor changes in the documentation - added tests 0.02 Sat Oct 23 17:04:20 2004 - added README 0.01 Sat Oct 23 17:04:20 2004 - original version; created by h2xs 1.22 with options -Xan Text::Affixes Text-Affixes-0.07/Makefile.PL0000644000076500007650000000100710141432713015241 0ustar jacjac00000000000000use 5.006; use ExtUtils::MakeMaker; # See lib/ExtUtils/MakeMaker.pm for details of how to influence # the contents of the Makefile that is written. WriteMakefile( 'NAME' => 'Text::Affixes', 'VERSION_FROM' => 'Affixes.pm', # finds $VERSION 'PREREQ_PM' => {}, # e.g., Module::Name => 1.1 ($] >= 5.005 ? ## Add these new keywords supported since 5.005 (ABSTRACT_FROM => 'Affixes.pm', # retrieve abstract from module AUTHOR => 'Jose Castro ', ) : ()), ); Text-Affixes-0.07/MANIFEST0000644000076500007650000000031410337675625014442 0ustar jacjac00000000000000Affixes.pm Changes Makefile.PL MANIFEST README t/01-prefixes.t t/02-suffixes.t t/03-extreme_cases.t t/pod.t t/pod-coverage.t META.yml Module meta-data (added by MakeMaker) Text-Affixes-0.07/META.yml0000644000076500007650000000045310337675634014566 0ustar jacjac00000000000000# http://module-build.sourceforge.net/META-spec.html #XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX# name: Text-Affixes version: 0.07 version_from: Affixes.pm installdirs: site requires: distribution_type: module generated_by: ExtUtils::MakeMaker version 6.17 Text-Affixes-0.07/README0000644000076500007650000000712010337675531014167 0ustar jacjac00000000000000Affixes.pm version 0.07 ======================= =head1 NAME Text::Affixes - Prefixes and suffixes analisys of text =head1 SYNOPSIS use Text::Affixes; my $text = "Hello, world. Hello, big world."; my $prefixes = get_prefixes($text); # $prefixes now holds # { # 3 => { # 'Hel' => 2, # 'wor' => 2, # } # } # or $prefixes = get_prefixes({min => 1, max => 2},$text); # $prefixes now holds # { # 1 => { # 'H' => 2, # 'w' => 2, # 'b' => 1, # }, # 2 => { # 'He' => 2, # 'wo' => 2, # 'bi' => 1, # } # } # the use for get_suffixes is similar =head1 DESCRIPTION Provides methods for prefixe and suffix analisys of text. =head1 METHODS =head2 get_prefixes Extracts prefixes from text. You can specify the minimum and maximum number of characters of prefixes you want. Returns a reference to a hash, where the specified limits are mapped in hashes; each of those hashes maps every prefix in the text into the number of times it was found. By default, both minimum and maximum limits are 3. If the minimum limit is greater than the lower one, an empty hash is returned. A prefix is considered to be a sequence of word characters (\w) in the beginning of a word (that is, after a word boundary) that does not reach the end of the word ("regular expressionly", a prefix is the $1 of /\b(\w+)\w/). # extracting prefixes of size 3 $prefixes = get_prefixes( $text ); # extracting prefixes of sizes 2 and 3 $prefixes = get_prefixes( {min => 2}, $text ); # extracting prefixes of sizes 3 and 4 $prefixes = get_prefixes( {max => 4}, $text ); # extracting prefixes of sizes 2, 3 and 4 $prefixes = get_prefixes( {min => 2, max=> 4}, $text); =head2 get_suffixes The get_suffixes function is similar to the get_prefixes one. You should read the documentation for that one and than come back to this point. A suffix is considered to be a sequence of word characters (\w) in the end of a word (that is, before a word boundary) that does not start at the beginning of the word ("regular expressionly" speaking, a prefix is the $1 of /\w(\w+)\b/). # extracting suffixes of size 3 $suffixes = get_suffixes( $text ); # extracting suffixes of sizes 2 and 3 $suffixes = get_suffixes( {min => 2}, $text ); # extracting suffixes of sizes 3 and 4 $suffixes = get_suffixes( {max => 4}, $text ); # extracting suffixes of sizes 2, 3 and 4 $suffixes = get_suffixes( {min => 2, max=> 4}, $text); =head1 OPTIONS Apart from deciding on a minimum and maximum size for prefixes or suffixes, you can also decide on some configuration options. =head2 exclude_numbers Set to 0 if you consider numbers as part of words. Default value is 1. # this get_suffixes( {min => 1, max => 1, exclude_numbers => 0}, "Hello, but w8" ); # returns this: { 1 => { 'o' => 1, 't' => 1, '8' => 1 } } =head2 lowercase Set to 1 to extract all prefixes in lowercase mode. Default value is 0. ATTENTION: This does not mean that prefixes with uppercased characters won't be extracted. It means they will be extracted after being lowercased. # this... get_prefixes( {min => 2, max => 2, lowercase => 1}, "Hello, hello"); # returns this: { 2 => { 'he' => 2 } } =head1 TO DO =over 6 =item * Make it more efficient (use C for that) =back =head1 AUTHOR Jose Castro, C<< >> =head1 COPYRIGHT & LICENSE Copyright 2004 Jose Castro, All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. Text-Affixes-0.07/t/0000755000076500007650000000000010337675634013556 5ustar jacjac00000000000000Text-Affixes-0.07/t/01-prefixes.t0000644000076500007650000000207310142541617015773 0ustar jacjac00000000000000use Test::More tests => 8; BEGIN { use_ok('Text::Affixes') }; my $text = "Hello, world. Hello, big world."; is_deeply( get_prefixes($text), { 3 => { 'Hel' => 2, 'wor' => 2, } } ); is_deeply( get_prefixes({min => 1, max => 2},$text), { 1 => { 'H' => 2, 'w' => 2, 'b' => 1, }, 2 => { 'He' => 2, 'wo' => 2, 'bi' => 1, } } ); $text = "Hello1, 2world"; is_deeply( get_prefixes({min => 2, max => 2}, $text), { 2 => { 'He' => 1, } } ); is_deeply( get_prefixes({min => 2, max => 2, exclude_numbers => 0}, $text), { 2 => { 'He' => 1, '2w' => 1, } } ); is_deeply( get_prefixes({min => 2, max => 2, lowercase => 1}, $text), { 2 => { 'he' => 1, } } ); $text = "Hello, hello"; is_deeply( get_prefixes({min => 2, max => 2, lowercase => 1}, $text), { 2 => { 'he' => 2, } } ); is_deeply( get_prefixes({min => 2, max => 2, lowercase => 0}, $text), { 2 => { 'He' => 1, 'he' => 1, } } ); Text-Affixes-0.07/t/02-suffixes.t0000644000076500007650000000206710142543124016001 0ustar jacjac00000000000000use Test::More tests => 8; BEGIN { use_ok('Text::Affixes') }; my $text = "Hello, world. Hello, big world."; is_deeply( get_suffixes($text), { 3 => { 'llo' => 2, 'rld' => 2, } }); is_deeply( get_suffixes({min => 1, max => 2},$text), { 1 => { 'o' => 2, 'd' => 2, 'g' => 1, }, 2 => { 'lo' => 2, 'ld' => 2, 'ig' => 1, } }); $text = "Hello1, 2worlD"; is_deeply( get_suffixes({min => 2, max => 2}, $text), { 2 => { 'lD' => 1, } } ); is_deeply( get_suffixes({min => 2, max => 2, exclude_numbers => 0}, $text), { 2 => { 'lD' => 1, 'o1' => 1, } } ); is_deeply( get_suffixes({min => 2, max => 2, lowercase => 1}, $text), { 2 => { 'ld' => 1, } } ); $text = "Hello, hellO"; is_deeply( get_suffixes({min => 2, max => 2, lowercase => 1}, $text), { 2 => { 'lo' => 2, } } ); is_deeply( get_suffixes({min => 2, max => 2, lowercase => 0}, $text), { 2 => { 'lo' => 1, 'lO' => 1, } } ); Text-Affixes-0.07/t/03-extreme_cases.t0000644000076500007650000000103410337675433017005 0ustar jacjac00000000000000use Test::More tests => 5; BEGIN { use_ok('Text::Affixes') }; my $text = "Hello, world. Hello, big world."; my $prefixes = get_prefixes( {}, $text); is_deeply( $prefixes , # $prefixes now holds { 3 => { 'Hel' => 2, 'wor' => 2, } }); $prefixes = get_prefixes({min => 0, max => 0},$text); is_deeply( $prefixes , # $prefixes now holds { }); $prefixes = get_prefixes({min => 4, max => 3},$text); is_deeply( $prefixes , # $prefixes now holds { }); is_deeply( get_prefixes(), undef ); Text-Affixes-0.07/t/pod-coverage.t0000644000076500007650000000025410141433001016264 0ustar jacjac00000000000000#!perl -T use Test::More; eval "use Test::Pod::Coverage 1.04"; plan skip_all => "Test::Pod::Coverage 1.04 required for testing POD coverage" if $@; all_pod_coverage_ok(); Text-Affixes-0.07/t/pod.t0000644000076500007650000000021410141433001014467 0ustar jacjac00000000000000#!perl -T use Test::More; eval "use Test::Pod 1.14"; plan skip_all => "Test::Pod 1.14 required for testing POD" if $@; all_pod_files_ok();