Text-Context-3.7/ 0000700 0000765 0000024 00000000000 11232015340 012562 5 ustar tony staff Text-Context-3.7/Changes 0000644 0000765 0000024 00000001411 11232015253 014067 0 ustar tony staff Revision history for Perl extension Text::Context.
3.7 Thu Jul 23 11:08:38 EEST 2009
- Add dependency on HTML::Entities (Jonathan Yu)
3.6 Tue Sep 6 21:51:11 UTC 2005
- Doc changes
3.5 Fri Oct 22 16:34:16 UTC 2004
- Tony Bowden is now maintainer
- split Text::Context::Para to its own file
- auto-require the Paragraph class
3.4 Sat May 1 11:57:17 2004
- Add dependency
3.3 Mon Oct 20 16:38:26 2003
- fixed a bug where terms would be marked-up overzealously.
3.2 Mon Oct 20 15:31:43 2003
- scoretable should not barf when there are no scores!
3.1 Mon Oct 20 15:01:22 2003
- Handle entity translation during to_html
3.0 Tue Apr 1 16:06:46 2003
- This is a complete rewrite, although the interface is mainly
the same.
Text-Context-3.7/lib/ 0000700 0000765 0000024 00000000000 11232015340 013330 5 ustar tony staff Text-Context-3.7/lib/Text/ 0000700 0000765 0000024 00000000000 11232015340 014254 5 ustar tony staff Text-Context-3.7/lib/Text/Context/ 0000700 0000765 0000024 00000000000 11232015340 015700 5 ustar tony staff Text-Context-3.7/lib/Text/Context/Para.pm 0000644 0000765 0000024 00000003775 10307407475 017167 0 ustar tony staff package Text::Context::Para;
=head1 NAME
Text::Context::Para - A paragraph in context
=head1 DESCRIPTION
This is a paragraph being used by Text::Context.
=cut
use strict;
use warnings;
use HTML::Entities;
use Text::Context::EitherSide qw(get_context);
use constant DEFAULT_START_TAG => '';
use constant DEFAULT_END_TAG => "";
=head1 CONSTRUCTOR
=head2 new
my $para = Text::Context::Para->new($content, $order);
=cut
sub new {
my ($class, $content, $order) = @_;
return bless {
content => $content,
scoretable => [],
marked_words => [],
final_score => 0,
order => $order
}, $class;
}
=head1 METHODS
=head2 best_keywords / slim
=head2 as_text / marked_up
You can override DEFAULT_START_TAG and DEFAULT_END_TAG. These default to
and
=cut
sub best_keywords {
my $self = shift;
return @{ $self->{scoretable}->[-1] || [] };
}
sub slim {
my ($self, $max_weight) = @_;
$self->{content} =~ s/^\s+//;
$self->{content} =~ s/\s+$//;
return $self if length $self->{content} <= $max_weight;
my @words = split /\s+/, $self->{content};
for (reverse(0 .. @words / 2)) {
my $trial = get_context($_, $self->{content}, @{ $self->{marked_words} });
if (length $trial < $max_weight) {
$self->{content} = $trial;
return $self;
}
}
$self->{content} = join " ... ", @{ $self->{marked_words} };
return $self; # Should not happen.
}
sub as_text { return $_[0]->{content} }
sub marked_up {
my $self = shift;
my $start_tag = shift || DEFAULT_START_TAG;
my $end_tag = shift || DEFAULT_END_TAG;
my $content = $self->as_text;
# Need to escape entities in here.
my $re = join "|", map { qr/\Q$_\E/i } @{ $self->{marked_words} };
my $re2 = qr/\b($re)\b/i;
my @fragments = split /$re2/i, $content;
my $output;
for my $orig_frag (@fragments) {
my $frag = encode_entities($orig_frag);
if ($orig_frag =~ /$re2/i) {
$frag = $start_tag . $frag . $end_tag;
}
$output .= $frag;
}
return $output;
}
1;
Text-Context-3.7/lib/Text/Context.pm 0000644 0000765 0000024 00000014256 11232015174 016265 0 ustar tony staff package Text::Context;
use strict;
use warnings;
use UNIVERSAL::require;
our $VERSION = "3.7";
=head1 NAME
Text::Context - Handle highlighting search result context snippets
=head1 SYNOPSIS
use Text::Context;
my $snippet = Text::Context->new($text, @keywords);
$snippet->keywords("foo", "bar"); # In case you change your mind
print $snippet->as_html;
print $snippet->as_text;
=head1 DESCRIPTION
Given a piece of text and some search terms, produces an object
which locates the search terms in the message, extracts a reasonable-length
string containing all the search terms, and optionally dumps the string out
as HTML text with the search terms highlighted in bold.
=head2 new
Creates a new snippet object for holding and formatting context for
search terms.
=cut
sub new {
my ($class, $text, @keywords) = @_;
my $self = bless { text => $text, keywords => [] }, $class;
$self->keywords(@keywords);
return $self;
}
=head2 keywords
Accessor method to get/set keywords. As the context search is done
case-insensitively, the keywords will be lower-cased.
=cut
sub keywords {
my ($self, @keywords) = @_;
$self->{keywords} = [ map { s/\s+/ /g; lc $_ } @keywords ] if @keywords;
return @{ $self->{keywords} };
}
=begin maintenance
=head2 prepare_text
Turns the text into a set of Paragraph objects, collapsing multiple
spaces in the text and feeding the paragraphs, in order, onto the
C member.
=head2 para_class
The Paragraph class to use. This defaults to 'Text::Context::Para'
=end maintenance
=cut
sub para_class { "Text::Context::Para" }
sub prepare_text {
my $self = shift;
my @paras = split /\n\n/, $self->{text};
for (0 .. $#paras) {
my $x = $paras[$_];
$x =~ s/\s+/ /g;
$self->para_class->require;
push @{ $self->{text_a} }, $self->para_class->new($x, $_);
}
}
=begin maintenance
=head2 permute_keywords
This is very clever. To determine which keywords "apply" to a given
paragraph, we first produce a set of all possible keyword sets. For
instance, given "a", "b" and "c", we want to produce
a b c
a b
a c
a
b c
b
c
We do this by counting in binary, and then mapping the counts onto
keywords.
=end maintenance
=cut
sub permute_keywords {
my $self = shift;
my @permutation;
for my $bitstring (1 .. (2**@{ $self->{keywords} }) - 1) {
my @thisperm;
for my $bitmask (0 .. @{ $self->{keywords} } - 1) {
push @thisperm, $self->{keywords}[$bitmask]
if $bitstring & 2**$bitmask;
}
push @permutation, \@thisperm;
}
return reverse @permutation;
}
=begin maintenance
=head2 score_para / get_appropriate_paras
Now we want to find a "score" for this paragraph, finding the best set
of keywords which "apply" to it. We favour keyword sets which have a
large number of matches (obviously a paragraph is better if it matches
"a" and "c" than if it just matches "a") and with multi-word keywords.
(A paragraph which matches "fresh cheese sandwiches" en bloc is worth
picking out, even if it has no other matches.)
=end maintenance
=cut
sub score_para {
my ($self, $para) = @_;
my $content = $para->{content};
my %matches;
# Do all the matching of keywords in advance of the boring
# permutation bit
for my $word (@{ $self->{keywords} }) {
my $word_score = 0;
$word_score += 1 + ($content =~ tr/ / /) if $content =~ /\b\Q$word\E\b/i;
$matches{$word} = $word_score;
}
#XXX : Possible optimization: Give up if there are no matches
for my $wordset ($self->permute_keywords) {
my $this_score = 0;
$this_score += $matches{$_} for @$wordset;
$para->{scoretable}[$this_score] = $wordset if $this_score > @$wordset;
}
$para->{final_score} = $#{ $para->{scoretable} };
}
sub _set_intersection {
my %union;
my %isect;
for (@_) { $union{$_}++ && ($isect{$_} = $_) }
return values %isect;
}
sub _set_difference {
my ($a, $b) = @_;
my %seen;
@seen{@$b} = ();
return grep { !exists $seen{$_} } @$a;
}
sub get_appropriate_paras {
my $self = shift;
my @app_paras;
my @keywords = @{ $self->{keywords} };
my @paras =
sort { $b->{final_score} <=> $a->{final_score} } @{ $self->{text_a} };
for my $para (@paras) {
my @words = _set_intersection($para->best_keywords, @keywords);
if (@words) {
@keywords = _set_difference(\@keywords, \@words);
$para->{marked_words} = \@words;
push @app_paras, $para;
last if !@keywords;
}
}
$self->{app_paras} = [ sort { $a->{order} <=> $b->{order} } @app_paras ];
return @{ $self->{app_paras} };
}
=head2 paras
@paras = $self->paras($maxlen)
Return shortened paragraphs to fit together into a snippet of at most
C<$maxlen> characters.
=cut
sub paras {
my $self = shift;
my $max_len = shift || 80;
$self->prepare_text;
$self->score_para($_) for @{ $self->{text_a} };
my @paras = $self->get_appropriate_paras;
return unless @paras;
# XXX: Algorithm may get better here by considering number of marked
# up words as weight
return map { $_->slim($max_len / @paras) } $self->get_appropriate_paras;
}
=head2 as_text
Calculates a "representative" string which contains
the given search terms. If there's lots and lots of context between the
terms, it's replaced with an ellipsis.
=cut
sub as_text {
return join " ... ", map { $_->as_text } $_[0]->paras;
}
=head2 as_html([ start => "", end => "" ])
Markup the snippet as a HTML string using the specified delimiters or
with a default set of delimiters (Cspan class="quoted"E>).
=cut
sub as_html {
my $self = shift;
my %args = @_;
my ($start, $end) = @args{qw(start end)};
return join " ... ", map { $_->marked_up($start, $end) } $self->paras;
}
=head1 AUTHOR
Original author: Simon Cozens
Current maintainer: Tony Bowden
=head1 BUGS and QUERIES
Please direct all correspondence regarding this module to:
bug-Text-Context@rt.cpan.org
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2002-2005 Kasei
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License; either version
2 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
=cut
1;
Text-Context-3.7/Makefile.PL 0000644 0000765 0000024 00000000432 11232015151 014545 0 ustar tony staff use ExtUtils::MakeMaker;
WriteMakefile(
'NAME' => 'Text::Context',
'VERSION_FROM' => 'lib/Text/Context.pm',
'PREREQ_PM' => {
'Text::Context::EitherSide' => 1.1,
'UNIVERSAL::require' => 0.03,
'HTML::Entities' => 0,
},
);
Text-Context-3.7/MANIFEST 0000644 0000765 0000024 00000000263 10307407606 013742 0 ustar tony staff Changes
lib/Text/Context.pm
lib/Text/Context/Para.pm
Makefile.PL
MANIFEST
META.yml Module meta-data (added by MakeMaker)
README
t/1.t
t/2.t
t/3.t
t/4.t
t/pod-coverage.t
t/pod.t
Text-Context-3.7/META.yml 0000600 0000765 0000024 00000000704 11232015340 014036 0 ustar tony staff --- #YAML:1.0
name: Text-Context
version: 3.7
abstract: ~
license: ~
author: ~
generated_by: ExtUtils::MakeMaker version 6.44
distribution_type: module
requires:
HTML::Entities: 0
Text::Context::EitherSide: 1.1
UNIVERSAL::require: 0.03
meta-spec:
url: http://module-build.sourceforge.net/META-spec-v1.3.html
version: 1.3
Text-Context-3.7/README 0000644 0000765 0000024 00000003774 10307410120 013464 0 ustar tony staff NAME
Text::Context - Handle highlighting search result context snippets
SYNOPSIS
use Text::Context;
my $snippet = Text::Context->new($text, @keywords);
$snippet->keywords("foo", "bar"); # In case you change your mind
print $snippet->as_html;
print $snippet->as_text;
DESCRIPTION
Given a piece of text and some search terms, produces an object which
locates the search terms in the message, extracts a reasonable-length
string containing all the search terms, and optionally dumps the string
out as HTML text with the search terms highlighted in bold.
new
Creates a new snippet object for holding and formatting context for
search terms.
keywords
Accessor method to get/set keywords. As the context search is done
case-insensitively, the keywords will be lower-cased.
paras
@paras = $self->paras($maxlen)
Return shortened paragraphs to fit together into a snippet of at most
$maxlen characters.
as_text
Calculates a "representative" string which contains the given search
terms. If there's lots and lots of context between the terms, it's
replaced with an ellipsis.
as_html([ start => "", end => "" ])
Markup the snippet as a HTML string using the specified delimiters or
with a default set of delimiters ("").
AUTHOR
Original author: Simon Cozens
Current maintainer: Tony Bowden
BUGS and QUERIES
Please direct all correspondence regarding this module to:
bug-Text-Context@rt.cpan.org
COPYRIGHT AND LICENSE
Copyright (C) 2002-2005 Kasei
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License; either version
2 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
Text-Context-3.7/t/ 0000700 0000765 0000024 00000000000 11232015340 013025 5 ustar tony staff Text-Context-3.7/t/1.t 0000644 0000765 0000024 00000002423 07744765001 013410 0 ustar tony staff use Test::More tests => 15;
use_ok "Text::Context";
# Unit tests for the Ruby port
my $s = Text::Context->new("This is a test\n\nAnd so is this.\n\nbut this has more words than the others", "TeSt",
"ThiS", "more words");
isa_ok($s, "Text::Context");
is_deeply([$s->keywords], ["test", "this", "more words"],
"Keywords downcase properly");
$s->prepare_text;
my @things = @{$s->{text_a}};
is @things, 3, "Proper number of paras";
for (@things) { isa_ok $_, "Text::Context::Para" }
is $things[0]->as_text, "This is a test", "Text maintained OK";
for (@things) { $s->score_para($_) }
is $things[0]->{final_score}, 8, "Score is OK (first para)";
is_deeply [$things[0]->best_keywords], ["test", "this"], "Keywords OK";
is $things[-1]->{final_score}, 16, "Score is OK (last para)";
$s->get_appropriate_paras;
my @paras = @{$s->{app_paras}};
is(@paras,2, "We selected two paragraphs");
is_deeply([map{$_->{order}}@paras],[0,2],"We selected the correct paras");
is($paras[0]->marked_up, 'This is a test',
"Can mark self up");
$s = Text::Context->new("This is a test\n\nAnd so is this.\n\nbut this has more words than the others", "TeSt",
"ThiS", "more words");
is($s->as_text, "This is a test ... but this has more words than the others",
"Simple test passed");
Text-Context-3.7/t/2.t 0000644 0000765 0000024 00000006544 07733263405 013420 0 ustar tony staff use strict;
use warnings;
use Test::More tests => 9;
use Text::Context;
undef $/; my $text = ;
{
my $snippet = Text::Context->new($text);
isa_ok($snippet, "Text::Context");
$snippet->keywords(qw(Wadler XQuery));
is(join (" ", $snippet->keywords), "wadler xquery",
"Keywords can be set (and are l/c'ed)");
}
{
my $snippet = Text::Context->new($text, "Wadler", "XQuery");
isa_ok($snippet, "Text::Context");
is(
join (" ", $snippet->keywords),
"wadler xquery",
"Keywords can be set in constructor and retrieved (and are l/c'ed)"
);
}
{
my $snippet = Text::Context->new($text, "Wadler", "XQuery");
my $expected =
"... quoting Phil Wadler, who recently ... "
. "lecture about XQuery said that ...";
is($snippet->as_text, $expected, "...and the text is correct");
}
my $snippet = Text::Context->new($text, "Wadler", "XQuery");
my $expected =
"... quoting Phil Wadler, who recently ... "
. "lecture about XQuery said that ...";
is($snippet->as_html(start => "", end => ""),
$expected, "as_html can take custom delimiters");
$expected =~ s///g;
$expected =~ s/<\/B>/<\/span>/g;
is($snippet->as_html(), $expected,
"as_html uses span as default delimiters");
{
my $snippet = Text::Context->new($text, "functional language");
is($snippet->as_text,
'... XSLT is considered to be a functional language by experts in > this > ...',
"and the text is correct",
);
$snippet = Text::Context->new($text, "functional language");
$snippet->keywords("functional", "language");
is($snippet->as_text,
'... XSLT is considered to be a functional language by experts in > this > ...',
"and the text is correct",
);
}
1;
__DATA__
--- bryan wrote:
>
> >While XSLT is considered to be a functional language by experts in
> this
> >field, it is definitely not a very nice representative of this class
> of
> >programming languages.
>
> OOOOH that's a baaad thing you said. :)
I'm just quoting Phil Wadler, who recently (at the School of Advanced
FP in Oxford, England, August) in his lecture about XQuery said that
"XSLT is probably the most used functional language and the ugliest
one".
>
> Anyway, it seems to me that you prefer Haskell out of the various
> functional languages, do you have a particular reason for this? I
> have
> problems with Haskell, I've tried and I've tried but it's frankly
> quite
> hard for me to follow programs written in Haskell once they get
> beyond
> a
> couple pages when printed, for functional languages I prefer Lisp and
> Erlang. Especially Erlang.
>
> So anyway what do you like especially about Haskell?
Strong typing, polymorphic types, type classes
Higher order functions
Huge expressiveness
Lazy evaluation + pattern matching
The (built-in support for the) very precise (monadic) approach to
encapsulating operations with side effects.
They even joke that once you have specified the types correctly, then
the solution just starts working... :o) and in reality quite often
this is really the case.
But I'm not comparing Haskell to other languages, just saying that I
like it.
=====
Cheers,
Dimitre Novatchev.
http://fxsl.sourceforge.net/ -- the home of FXSL
__________________________________________________
Do you Yahoo!?
Faith Hill - Exclusive Performances, Videos & More
http://faith.yahoo.com
XSL-List info and archive: http://www.mulberrytech.com/xsl/xsl-list
Text-Context-3.7/t/3.t 0000644 0000765 0000024 00000000626 07744765252 013425 0 ustar tony staff #!/usr/bin/perl
use Text::Context;
use Test::More 'no_plan';
# We want to test snippeting of things which should be marked up as HTML
# entities.
my $s = Text::Context->new(' find s&z ', "me", "s&z");
my $output = '<html> find <me> s&z </html>';
is($s->as_html, $output, "entities are handled correctly");
Text-Context-3.7/t/4.t 0000644 0000765 0000024 00000000377 07744770661 013431 0 ustar tony staff use Text::Context;
use Test::More qw(no_plan);
# What if the terms aren't there are all?
my $tc = Text::Context->new("Re: Defect in XBD lround",
"+44 118 9508311 ext 2250", "+44 118 9500110", "josey");
is($#{[$tc->paras]}, -1, "... and it doesn't die");
Text-Context-3.7/t/pod-coverage.t 0000644 0000765 0000024 00000000241 10307406702 015604 0 ustar tony staff use Test::More;
eval "use Test::Pod::Coverage 1.00";
plan skip_all => "Test::Pod::Coverage 1.00 required for testing POD coverage" if $@;
all_pod_coverage_ok();
Text-Context-3.7/t/pod.t 0000644 0000765 0000024 00000000201 10307406702 014007 0 ustar tony staff use Test::More;
eval "use Test::Pod 1.00";
plan skip_all => "Test::Pod 1.00 required for testing POD" if $@;
all_pod_files_ok();