Text-Context-3.7/0000700000076500000240000000000011232015340012562 5ustar tonystaffText-Context-3.7/Changes0000644000076500000240000000141111232015253014067 0ustar tonystaffRevision history for Perl extension Text::Context. 3.7 Thu Jul 23 11:08:38 EEST 2009 - Add dependency on HTML::Entities (Jonathan Yu) 3.6 Tue Sep 6 21:51:11 UTC 2005 - Doc changes 3.5 Fri Oct 22 16:34:16 UTC 2004 - Tony Bowden is now maintainer - split Text::Context::Para to its own file - auto-require the Paragraph class 3.4 Sat May 1 11:57:17 2004 - Add dependency 3.3 Mon Oct 20 16:38:26 2003 - fixed a bug where terms would be marked-up overzealously. 3.2 Mon Oct 20 15:31:43 2003 - scoretable should not barf when there are no scores! 3.1 Mon Oct 20 15:01:22 2003 - Handle entity translation during to_html 3.0 Tue Apr 1 16:06:46 2003 - This is a complete rewrite, although the interface is mainly the same. Text-Context-3.7/lib/0000700000076500000240000000000011232015340013330 5ustar tonystaffText-Context-3.7/lib/Text/0000700000076500000240000000000011232015340014254 5ustar tonystaffText-Context-3.7/lib/Text/Context/0000700000076500000240000000000011232015340015700 5ustar tonystaffText-Context-3.7/lib/Text/Context/Para.pm0000644000076500000240000000377510307407475017167 0ustar tonystaffpackage Text::Context::Para; =head1 NAME Text::Context::Para - A paragraph in context =head1 DESCRIPTION This is a paragraph being used by Text::Context. =cut use strict; use warnings; use HTML::Entities; use Text::Context::EitherSide qw(get_context); use constant DEFAULT_START_TAG => ''; use constant DEFAULT_END_TAG => ""; =head1 CONSTRUCTOR =head2 new my $para = Text::Context::Para->new($content, $order); =cut sub new { my ($class, $content, $order) = @_; return bless { content => $content, scoretable => [], marked_words => [], final_score => 0, order => $order }, $class; } =head1 METHODS =head2 best_keywords / slim =head2 as_text / marked_up You can override DEFAULT_START_TAG and DEFAULT_END_TAG. These default to and =cut sub best_keywords { my $self = shift; return @{ $self->{scoretable}->[-1] || [] }; } sub slim { my ($self, $max_weight) = @_; $self->{content} =~ s/^\s+//; $self->{content} =~ s/\s+$//; return $self if length $self->{content} <= $max_weight; my @words = split /\s+/, $self->{content}; for (reverse(0 .. @words / 2)) { my $trial = get_context($_, $self->{content}, @{ $self->{marked_words} }); if (length $trial < $max_weight) { $self->{content} = $trial; return $self; } } $self->{content} = join " ... ", @{ $self->{marked_words} }; return $self; # Should not happen. } sub as_text { return $_[0]->{content} } sub marked_up { my $self = shift; my $start_tag = shift || DEFAULT_START_TAG; my $end_tag = shift || DEFAULT_END_TAG; my $content = $self->as_text; # Need to escape entities in here. my $re = join "|", map { qr/\Q$_\E/i } @{ $self->{marked_words} }; my $re2 = qr/\b($re)\b/i; my @fragments = split /$re2/i, $content; my $output; for my $orig_frag (@fragments) { my $frag = encode_entities($orig_frag); if ($orig_frag =~ /$re2/i) { $frag = $start_tag . $frag . $end_tag; } $output .= $frag; } return $output; } 1; Text-Context-3.7/lib/Text/Context.pm0000644000076500000240000001425611232015174016265 0ustar tonystaffpackage Text::Context; use strict; use warnings; use UNIVERSAL::require; our $VERSION = "3.7"; =head1 NAME Text::Context - Handle highlighting search result context snippets =head1 SYNOPSIS use Text::Context; my $snippet = Text::Context->new($text, @keywords); $snippet->keywords("foo", "bar"); # In case you change your mind print $snippet->as_html; print $snippet->as_text; =head1 DESCRIPTION Given a piece of text and some search terms, produces an object which locates the search terms in the message, extracts a reasonable-length string containing all the search terms, and optionally dumps the string out as HTML text with the search terms highlighted in bold. =head2 new Creates a new snippet object for holding and formatting context for search terms. =cut sub new { my ($class, $text, @keywords) = @_; my $self = bless { text => $text, keywords => [] }, $class; $self->keywords(@keywords); return $self; } =head2 keywords Accessor method to get/set keywords. As the context search is done case-insensitively, the keywords will be lower-cased. =cut sub keywords { my ($self, @keywords) = @_; $self->{keywords} = [ map { s/\s+/ /g; lc $_ } @keywords ] if @keywords; return @{ $self->{keywords} }; } =begin maintenance =head2 prepare_text Turns the text into a set of Paragraph objects, collapsing multiple spaces in the text and feeding the paragraphs, in order, onto the C member. =head2 para_class The Paragraph class to use. This defaults to 'Text::Context::Para' =end maintenance =cut sub para_class { "Text::Context::Para" } sub prepare_text { my $self = shift; my @paras = split /\n\n/, $self->{text}; for (0 .. $#paras) { my $x = $paras[$_]; $x =~ s/\s+/ /g; $self->para_class->require; push @{ $self->{text_a} }, $self->para_class->new($x, $_); } } =begin maintenance =head2 permute_keywords This is very clever. To determine which keywords "apply" to a given paragraph, we first produce a set of all possible keyword sets. For instance, given "a", "b" and "c", we want to produce a b c a b a c a b c b c We do this by counting in binary, and then mapping the counts onto keywords. =end maintenance =cut sub permute_keywords { my $self = shift; my @permutation; for my $bitstring (1 .. (2**@{ $self->{keywords} }) - 1) { my @thisperm; for my $bitmask (0 .. @{ $self->{keywords} } - 1) { push @thisperm, $self->{keywords}[$bitmask] if $bitstring & 2**$bitmask; } push @permutation, \@thisperm; } return reverse @permutation; } =begin maintenance =head2 score_para / get_appropriate_paras Now we want to find a "score" for this paragraph, finding the best set of keywords which "apply" to it. We favour keyword sets which have a large number of matches (obviously a paragraph is better if it matches "a" and "c" than if it just matches "a") and with multi-word keywords. (A paragraph which matches "fresh cheese sandwiches" en bloc is worth picking out, even if it has no other matches.) =end maintenance =cut sub score_para { my ($self, $para) = @_; my $content = $para->{content}; my %matches; # Do all the matching of keywords in advance of the boring # permutation bit for my $word (@{ $self->{keywords} }) { my $word_score = 0; $word_score += 1 + ($content =~ tr/ / /) if $content =~ /\b\Q$word\E\b/i; $matches{$word} = $word_score; } #XXX : Possible optimization: Give up if there are no matches for my $wordset ($self->permute_keywords) { my $this_score = 0; $this_score += $matches{$_} for @$wordset; $para->{scoretable}[$this_score] = $wordset if $this_score > @$wordset; } $para->{final_score} = $#{ $para->{scoretable} }; } sub _set_intersection { my %union; my %isect; for (@_) { $union{$_}++ && ($isect{$_} = $_) } return values %isect; } sub _set_difference { my ($a, $b) = @_; my %seen; @seen{@$b} = (); return grep { !exists $seen{$_} } @$a; } sub get_appropriate_paras { my $self = shift; my @app_paras; my @keywords = @{ $self->{keywords} }; my @paras = sort { $b->{final_score} <=> $a->{final_score} } @{ $self->{text_a} }; for my $para (@paras) { my @words = _set_intersection($para->best_keywords, @keywords); if (@words) { @keywords = _set_difference(\@keywords, \@words); $para->{marked_words} = \@words; push @app_paras, $para; last if !@keywords; } } $self->{app_paras} = [ sort { $a->{order} <=> $b->{order} } @app_paras ]; return @{ $self->{app_paras} }; } =head2 paras @paras = $self->paras($maxlen) Return shortened paragraphs to fit together into a snippet of at most C<$maxlen> characters. =cut sub paras { my $self = shift; my $max_len = shift || 80; $self->prepare_text; $self->score_para($_) for @{ $self->{text_a} }; my @paras = $self->get_appropriate_paras; return unless @paras; # XXX: Algorithm may get better here by considering number of marked # up words as weight return map { $_->slim($max_len / @paras) } $self->get_appropriate_paras; } =head2 as_text Calculates a "representative" string which contains the given search terms. If there's lots and lots of context between the terms, it's replaced with an ellipsis. =cut sub as_text { return join " ... ", map { $_->as_text } $_[0]->paras; } =head2 as_html([ start => "", end => "" ]) Markup the snippet as a HTML string using the specified delimiters or with a default set of delimiters (Cspan class="quoted"E>). =cut sub as_html { my $self = shift; my %args = @_; my ($start, $end) = @args{qw(start end)}; return join " ... ", map { $_->marked_up($start, $end) } $self->paras; } =head1 AUTHOR Original author: Simon Cozens Current maintainer: Tony Bowden =head1 BUGS and QUERIES Please direct all correspondence regarding this module to: bug-Text-Context@rt.cpan.org =head1 COPYRIGHT AND LICENSE Copyright (C) 2002-2005 Kasei This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. =cut 1; Text-Context-3.7/Makefile.PL0000644000076500000240000000043211232015151014545 0ustar tonystaffuse ExtUtils::MakeMaker; WriteMakefile( 'NAME' => 'Text::Context', 'VERSION_FROM' => 'lib/Text/Context.pm', 'PREREQ_PM' => { 'Text::Context::EitherSide' => 1.1, 'UNIVERSAL::require' => 0.03, 'HTML::Entities' => 0, }, ); Text-Context-3.7/MANIFEST0000644000076500000240000000026310307407606013742 0ustar tonystaffChanges lib/Text/Context.pm lib/Text/Context/Para.pm Makefile.PL MANIFEST META.yml Module meta-data (added by MakeMaker) README t/1.t t/2.t t/3.t t/4.t t/pod-coverage.t t/pod.t Text-Context-3.7/META.yml0000600000076500000240000000070411232015340014036 0ustar tonystaff--- #YAML:1.0 name: Text-Context version: 3.7 abstract: ~ license: ~ author: ~ generated_by: ExtUtils::MakeMaker version 6.44 distribution_type: module requires: HTML::Entities: 0 Text::Context::EitherSide: 1.1 UNIVERSAL::require: 0.03 meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.3.html version: 1.3 Text-Context-3.7/README0000644000076500000240000000377410307410120013464 0ustar tonystaffNAME Text::Context - Handle highlighting search result context snippets SYNOPSIS use Text::Context; my $snippet = Text::Context->new($text, @keywords); $snippet->keywords("foo", "bar"); # In case you change your mind print $snippet->as_html; print $snippet->as_text; DESCRIPTION Given a piece of text and some search terms, produces an object which locates the search terms in the message, extracts a reasonable-length string containing all the search terms, and optionally dumps the string out as HTML text with the search terms highlighted in bold. new Creates a new snippet object for holding and formatting context for search terms. keywords Accessor method to get/set keywords. As the context search is done case-insensitively, the keywords will be lower-cased. paras @paras = $self->paras($maxlen) Return shortened paragraphs to fit together into a snippet of at most $maxlen characters. as_text Calculates a "representative" string which contains the given search terms. If there's lots and lots of context between the terms, it's replaced with an ellipsis. as_html([ start => "", end => "" ]) Markup the snippet as a HTML string using the specified delimiters or with a default set of delimiters (""). AUTHOR Original author: Simon Cozens Current maintainer: Tony Bowden BUGS and QUERIES Please direct all correspondence regarding this module to: bug-Text-Context@rt.cpan.org COPYRIGHT AND LICENSE Copyright (C) 2002-2005 Kasei This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. Text-Context-3.7/t/0000700000076500000240000000000011232015340013025 5ustar tonystaffText-Context-3.7/t/1.t0000644000076500000240000000242307744765001013410 0ustar tonystaffuse Test::More tests => 15; use_ok "Text::Context"; # Unit tests for the Ruby port my $s = Text::Context->new("This is a test\n\nAnd so is this.\n\nbut this has more words than the others", "TeSt", "ThiS", "more words"); isa_ok($s, "Text::Context"); is_deeply([$s->keywords], ["test", "this", "more words"], "Keywords downcase properly"); $s->prepare_text; my @things = @{$s->{text_a}}; is @things, 3, "Proper number of paras"; for (@things) { isa_ok $_, "Text::Context::Para" } is $things[0]->as_text, "This is a test", "Text maintained OK"; for (@things) { $s->score_para($_) } is $things[0]->{final_score}, 8, "Score is OK (first para)"; is_deeply [$things[0]->best_keywords], ["test", "this"], "Keywords OK"; is $things[-1]->{final_score}, 16, "Score is OK (last para)"; $s->get_appropriate_paras; my @paras = @{$s->{app_paras}}; is(@paras,2, "We selected two paragraphs"); is_deeply([map{$_->{order}}@paras],[0,2],"We selected the correct paras"); is($paras[0]->marked_up, 'This is a test', "Can mark self up"); $s = Text::Context->new("This is a test\n\nAnd so is this.\n\nbut this has more words than the others", "TeSt", "ThiS", "more words"); is($s->as_text, "This is a test ... but this has more words than the others", "Simple test passed"); Text-Context-3.7/t/2.t0000644000076500000240000000654407733263405013420 0ustar tonystaffuse strict; use warnings; use Test::More tests => 9; use Text::Context; undef $/; my $text = ; { my $snippet = Text::Context->new($text); isa_ok($snippet, "Text::Context"); $snippet->keywords(qw(Wadler XQuery)); is(join (" ", $snippet->keywords), "wadler xquery", "Keywords can be set (and are l/c'ed)"); } { my $snippet = Text::Context->new($text, "Wadler", "XQuery"); isa_ok($snippet, "Text::Context"); is( join (" ", $snippet->keywords), "wadler xquery", "Keywords can be set in constructor and retrieved (and are l/c'ed)" ); } { my $snippet = Text::Context->new($text, "Wadler", "XQuery"); my $expected = "... quoting Phil Wadler, who recently ... " . "lecture about XQuery said that ..."; is($snippet->as_text, $expected, "...and the text is correct"); } my $snippet = Text::Context->new($text, "Wadler", "XQuery"); my $expected = "... quoting Phil Wadler, who recently ... " . "lecture about XQuery said that ..."; is($snippet->as_html(start => "", end => ""), $expected, "as_html can take custom delimiters"); $expected =~ s///g; $expected =~ s/<\/B>/<\/span>/g; is($snippet->as_html(), $expected, "as_html uses span as default delimiters"); { my $snippet = Text::Context->new($text, "functional language"); is($snippet->as_text, '... XSLT is considered to be a functional language by experts in > this > ...', "and the text is correct", ); $snippet = Text::Context->new($text, "functional language"); $snippet->keywords("functional", "language"); is($snippet->as_text, '... XSLT is considered to be a functional language by experts in > this > ...', "and the text is correct", ); } 1; __DATA__ --- bryan wrote: > > >While XSLT is considered to be a functional language by experts in > this > >field, it is definitely not a very nice representative of this class > of > >programming languages. > > OOOOH that's a baaad thing you said. :) I'm just quoting Phil Wadler, who recently (at the School of Advanced FP in Oxford, England, August) in his lecture about XQuery said that "XSLT is probably the most used functional language and the ugliest one". > > Anyway, it seems to me that you prefer Haskell out of the various > functional languages, do you have a particular reason for this? I > have > problems with Haskell, I've tried and I've tried but it's frankly > quite > hard for me to follow programs written in Haskell once they get > beyond > a > couple pages when printed, for functional languages I prefer Lisp and > Erlang. Especially Erlang. > > So anyway what do you like especially about Haskell? Strong typing, polymorphic types, type classes Higher order functions Huge expressiveness Lazy evaluation + pattern matching The (built-in support for the) very precise (monadic) approach to encapsulating operations with side effects. They even joke that once you have specified the types correctly, then the solution just starts working... :o) and in reality quite often this is really the case. But I'm not comparing Haskell to other languages, just saying that I like it. ===== Cheers, Dimitre Novatchev. http://fxsl.sourceforge.net/ -- the home of FXSL __________________________________________________ Do you Yahoo!? Faith Hill - Exclusive Performances, Videos & More http://faith.yahoo.com XSL-List info and archive: http://www.mulberrytech.com/xsl/xsl-list Text-Context-3.7/t/3.t0000644000076500000240000000062607744765252013425 0ustar tonystaff#!/usr/bin/perl use Text::Context; use Test::More 'no_plan'; # We want to test snippeting of things which should be marked up as HTML # entities. my $s = Text::Context->new(' find s&z ', "me", "s&z"); my $output = '<html> find <me> s&z </html>'; is($s->as_html, $output, "entities are handled correctly"); Text-Context-3.7/t/4.t0000644000076500000240000000037707744770661013431 0ustar tonystaffuse Text::Context; use Test::More qw(no_plan); # What if the terms aren't there are all? my $tc = Text::Context->new("Re: Defect in XBD lround", "+44 118 9508311 ext 2250", "+44 118 9500110", "josey"); is($#{[$tc->paras]}, -1, "... and it doesn't die"); Text-Context-3.7/t/pod-coverage.t0000644000076500000240000000024110307406702015604 0ustar tonystaffuse Test::More; eval "use Test::Pod::Coverage 1.00"; plan skip_all => "Test::Pod::Coverage 1.00 required for testing POD coverage" if $@; all_pod_coverage_ok(); Text-Context-3.7/t/pod.t0000644000076500000240000000020110307406702014007 0ustar tonystaffuse Test::More; eval "use Test::Pod 1.00"; plan skip_all => "Test::Pod 1.00 required for testing POD" if $@; all_pod_files_ok();