Text-Names-0.46/ 000755 000765 000024 00000000000 13033766307 014416 5 ustar 00dbourget staff 000000 000000 Text-Names-0.46/Changes 000644 000765 000024 00000001542 12675336670 015722 0 ustar 00dbourget staff 000000 000000 Revision history for Perl module Text::Names
0.01 2011-02-07
- original version; created by h2xs 1.23 with options
-AX Text::Names
0.02 2011-02-08
- Fixed dependency issues
0.03 2011-02-08
- Improved doc
0.19 2012-04-25
- Tons of improvements
0.20 2013-03-11
- Removed annoying uninitialized variable warnings
0.30 2013-08-09
- Added guessGender subroutine
0.31 2013-08-10
- Tweaks to guessGender subroutine
0.32 2013-08-10
- Documentation
0.33 2013-08-10
- Fixed Changes file
0.39 2014-03-09
- Fixed issues with reversed names without commas
0.40 2014-04-05
- Fixed issues with O'* names
0.41 2015-01-05
- Added firstnamePrevalence() and surnamePrevalence()
0.42 2015-04-01
- Merged doc fixes from ugexe
0.43 2015-05-15
- Make guessGender work with full names; improved guessGender recall
Text-Names-0.46/lib/ 000755 000765 000024 00000000000 13033766307 015164 5 ustar 00dbourget staff 000000 000000 Text-Names-0.46/Makefile.PL 000644 000765 000024 00000001535 12675336670 016403 0 ustar 00dbourget staff 000000 000000 use 5.0;
use ExtUtils::MakeMaker;
# See lib/ExtUtils/MakeMaker.pm for details of how to influence
# the contents of the Makefile that is written.
WriteMakefile(
NAME => 'Text::Names',
VERSION_FROM => 'lib/Text/Names.pm', # finds $VERSION
PREREQ_PM => {
'Text::Capitalize' => 1.1,
'Text::LevenshteinXS' => 0.03,
'Test::More' => 0.96
}, # e.g., Module::Name => 1.1
($] >= 5.005 ? ## Add these new keywords supported since 5.005
(ABSTRACT_FROM => 'lib/Text/Names.pm', # retrieve abstract from module
AUTHOR => 'David Bourget') : ()),
($ExtUtils::MakeMaker::VERSION < 6.46
? ()
: ( META_MERGE => {
resources => {
repository => 'https://github.com/dbourget/Text-Names',
},
},
)
),
);
Text-Names-0.46/MANIFEST 000644 000765 000024 00000000472 13033766307 015552 0 ustar 00dbourget staff 000000 000000 Changes
lib/Text/Names.pm
Makefile.PL
MANIFEST This list of files
README
t/01-abbreviations.t
t/02-samePerson.t
t/03-parseNames.t
t/04-common.t
META.yml Module YAML meta-data (added by MakeMaker)
META.json Module JSON meta-data (added by MakeMaker)
Text-Names-0.46/META.json 000644 000765 000024 00000002133 13033766307 016036 0 ustar 00dbourget staff 000000 000000 {
"abstract" : "Perl extension for proper name parsing, normalization, recognition, and classification",
"author" : [
"David Bourget"
],
"dynamic_config" : 1,
"generated_by" : "ExtUtils::MakeMaker version 6.66, CPAN::Meta::Converter version 2.133380",
"license" : [
"unknown"
],
"meta-spec" : {
"url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
"version" : "2"
},
"name" : "Text-Names",
"no_index" : {
"directory" : [
"t",
"inc"
]
},
"prereqs" : {
"build" : {
"requires" : {
"ExtUtils::MakeMaker" : "0"
}
},
"configure" : {
"requires" : {
"ExtUtils::MakeMaker" : "0"
}
},
"runtime" : {
"requires" : {
"Test::More" : "0.96",
"Text::Capitalize" : "1.1",
"Text::LevenshteinXS" : "0.03"
}
}
},
"release_status" : "stable",
"resources" : {
"repository" : {
"url" : "https://github.com/dbourget/Text-Names"
}
},
"version" : "0.46"
}
Text-Names-0.46/META.yml 000644 000765 000024 00000001207 13033766307 015667 0 ustar 00dbourget staff 000000 000000 ---
abstract: 'Perl extension for proper name parsing, normalization, recognition, and classification'
author:
- 'David Bourget'
build_requires:
ExtUtils::MakeMaker: 0
configure_requires:
ExtUtils::MakeMaker: 0
dynamic_config: 1
generated_by: 'ExtUtils::MakeMaker version 6.66, CPAN::Meta::Converter version 2.133380'
license: unknown
meta-spec:
url: http://module-build.sourceforge.net/META-spec-v1.4.html
version: 1.4
name: Text-Names
no_index:
directory:
- t
- inc
requires:
Test::More: 0.96
Text::Capitalize: 1.1
Text::LevenshteinXS: 0.03
resources:
repository: https://github.com/dbourget/Text-Names
version: 0.46
Text-Names-0.46/README 000644 000765 000024 00000001122 12675336670 015301 0 ustar 00dbourget staff 000000 000000 Text-Names
=======================
A Perl extension for name parsing and normalization. See POD for details.
INSTALLATION
To install this module type the following:
perl Makefile.PL
make
make test
make install
DEPENDENCIES
This module requires these other modules and libraries:
Text::Capitalize
COPYRIGHT AND LICENCE
Copyright (C) 2011-2013 by David Bourget
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.10.1 or,
at your option, any later version of Perl 5 you may have available.
Text-Names-0.46/t/ 000755 000765 000024 00000000000 13033766307 014661 5 ustar 00dbourget staff 000000 000000 Text-Names-0.46/t/01-abbreviations.t 000644 000765 000024 00000000434 12675336670 020124 0 ustar 00dbourget staff 000000 000000 use Text::Names qw/abbreviationOf/;
use Test::More;
ok(abbreviationOf('Dave','David'),'Dave -> David');
ok(abbreviationOf('Mike','Michael'),'Mike -> Michael');
ok(abbreviationOf('Bella','Belinda'),'Bella -> Belinda');
ok(!abbreviationOf('John','Bob'),'John !> Bob');
done_testing;
Text-Names-0.46/t/02-samePerson.t 000644 000765 000024 00000002250 12675343463 017405 0 ustar 00dbourget staff 000000 000000 use Text::Names qw/samePerson/;
use Test::More;
ok(samePerson('Dave Bourget','David Bourget'),'Dave Bourget -> David Bourget');
ok(samePerson('Dave J. Bourget','David John Bourget'),'Dave J. Bourget -> David John Bourget');
ok(samePerson('D J Bourget','David John Bourget'),'D J Bourget -> David John Bourget');
ok(samePerson('D Bourget','David John Bourget'),'D Bourget -> David John Bourget');
ok(samePerson('Bourget, David', 'Dave Bourget Jr'), 'Bourget, David -> Dave Bourget Jr');
ok(samePerson('Dave Bourget','Bourget, David F.'),'Dave Bourget -> Bourget, David F.');
ok(!samePerson('J Bourget','David John Bourget'),'J Bourget !> David John Bourget');
ok(!samePerson('D F Bourget','David John Bourget'),'D F Bourget !> David John Bourget');
ok(!samePerson('John Doe','David John Bourget'),'John Doe !> David John Bourget');
is( samePerson("Bourget, David J.","Bourget, David"), "Bourget, David J.", "Bourget, David J as return value");
is( samePerson("Bourget, David J.","Bourget, David X."), undef, "Not compatible = undef");
ok(samePerson('Fredrik Björklund','F Bjorklund'),'Björklund, Fredrik');
ok( samePerson("Bourget, David", "David, Bourget", "loose" =>1 ) );
done_testing;
Text-Names-0.46/t/03-parseNames.t 000644 000765 000024 00000005674 12675336670 017407 0 ustar 00dbourget staff 000000 000000 use Text::Names 'parseNames','cleanName','composeName','parseName';
use Test::More;
use utf8;
binmode(STDOUT,":utf8");
is(cleanName("T.H. Ho"),"Ho, T. H.");
is(cleanName("Ho, T. . ."),"Ho, T.");
is(cleanName("D.'Arms, D."), "D'Arms, D.");
is(cleanName("D.’Arms, D."), "D'Arms, D.");
is(cleanName("D’Arms, D."), "D’Arms, D.");
my ($f,$l) = parseName("D’Arms, D.");
is($l,"D’Arms");
my %tests = (
'Kuehni, R. G., Hardin, C. L.' => 'Kuehni, R. G.; Hardin, C. L.',
'Bourget, David; Doe, John' => 'Bourget, David; Doe, John',
'David Bourget & John Doe' => 'Bourget, David; Doe, John',
'David Bourget and John Doe' => 'Bourget, David; Doe, John',
'Bourget D, Doe J' => 'Bourget, D.; Doe, J.',
'Bourget DJR' => 'Bourget, D. J. R.',
'Bourget, D.J.R.' => 'Bourget, D. J. R.',
'Bourget D.J.R.' => 'Bourget, D. J. R.',
'D.J. Bourget' => 'Bourget, D. J.',
'Bourget, DAVID' => 'Bourget, DAVID',
'David BOURGET' => 'BOURGET, David',
'David Chalmers, David Bourget and John Doe' => 'Chalmers, David; Bourget, David; Doe, John',
'Chalmers, David, Bourget, David, Doe, John' => 'Chalmers, David; Bourget, David; Doe, John',
'Chalmers, David John, Bourget, David, Doe, John C.' => 'Chalmers, David John; Bourget, David; Doe, John C.',
'DAVID BOURGET' => 'BOURGET, DAVID',
'John Doe Jr' => 'Doe Jr, John',
'John M. Doe Jr' => 'Doe Jr, John M.',
'Dr Afsar Abbas' => 'Abbas, Afsar',
'R. de Sousa' => 'de Sousa, R.',
'Jean Claude van Damme' => 'van Damme, Jean Claude',
'Dr. Jean Claude van Damme, Prof R de Sousa' => 'van Damme, Jean Claude; de Sousa, R.',
"Maureen A. O'Malley" => "O'Malley, Maureen A.",
"Gusmão da Silva, Guilherme" => "Gusmão da Silva, Guilherme",
"D Bourget, Zbigniew Z Lukasiak and John Doe" => "Bourget, D.; Lukasiak, Zbigniew Z.; Doe, John",
"Bourget, D and John Doe" => "Bourget, D.; Doe, John",
"Bourget, D, Chalmers C, and John Doe" => "Bourget, D.; Chalmers, C.; Doe, John",
cleanName("Guilherme Gusmão da Silva") => "da Silva, Guilherme Gusmão",
cleanName("van Untouched, Firstname") => "van Untouched, Firstname",
cleanName("Van Untouched, Firstname") => "Van Untouched, Firstname",
cleanName("VAN TOUCHED, firstname") => "van Touched, Firstname",
cleanName("CL Adams") => "Adams, C. L.",
cleanName("Hacker, PMS") => "Hacker, P. M. S.",
cleanName("van fraassen b") => "van Fraassen, B.",
cleanName("van fraassen b c") => "van Fraassen, B. C.",
cleanName("RawlsJ.") => "Rawls, J.",
cleanName("RawlsJ.C.") => "Rawls, J. C.",
cleanName("RawlsJC") => "Rawls, J. C.",
cleanName("McKim, John") => "McKim, John",
cleanName("John McKim") => "McKim, John",
cleanName("McKim") => "McKim, "
);
is(cleanName("Hacker, PMS"),"Hacker, P. M. S.");
is(cleanName("Doe, Bob"),"Doe, Bob");
#print cleanName("Guilherme Gusmão da Silva");
foreach my $t (keys %tests) {
my $r = join('; ',parseNames($t));
is( $r, $tests{$t}, "$t -> $r" );
}
done_testing;
Text-Names-0.46/t/04-common.t 000644 000765 000024 00000003327 13033732023 016550 0 ustar 00dbourget staff 000000 000000 use Test::More;
use Text::Names qw/isCommonSurname isCommonFirstname guessGender firstnamePrevalence surnamePrevalence isLikelyMisparsed/;
ok(isCommonSurname('Smith'),'No threshold test, positive');
ok(isCommonSurname('Kennedy'),'No threshold test, positive');
ok(!isCommonSurname('Sdakljdslkafdjdfsa'),'No threshold test, negative');
ok(isCommonSurname('Smith',0.01),'Threshold test, positive');
ok(!isCommonSurname('Smith',5),'Threshold test, negative');
ok(isCommonFirstname('Mary'),'Firstname no threshold test, positive');
ok(isCommonFirstname('David'),'Firstname no threshold test, positive');
ok(!isCommonFirstname('Aklajldkfsjfd'),'Firstname no threshold test, negative');
ok(isCommonFirstname('Mary',0.5),'Firstname threshold test, positive');
ok(isCommonFirstname('John',1),'Firstname threshold test, positive');
ok(!isCommonFirstname('Mark',3),'Firstname threshold test, negative');
ok(guessGender('David') eq 'M');
ok(guessGender('lkjasdf') == undef);
ok(guessGender('Mary') eq 'F');
ok(guessGender('Arthur') eq 'M');
ok(guessGender('William') eq 'M');
is(guessGender('Christian'), 'M');
is(guessGender('Arthur Flintstone'), 'M');
is(guessGender("Christian Loew"),"M");
is(guessGender('Natalia'), 'F');
is(guessGender('Ana'), 'F');
is(guessGender('Eleni'), 'F');
ok(firstnamePrevalence('David') > 1);
ok(surnamePrevalence('Smith') > 1);
ok(firstnamePrevalence('Angela') > 0);
ok(surnamePrevalence('Bourvici') == 0);
ok(isLikelyMisparsed('Bourget, David John Richard Bill'));
ok(isLikelyMisparsed('Alexia, Smith'));
ok(!isLikelyMisparsed('Bourget, David'));
ok(isLikelyMisparsed('Bourget, David Dr'), 'Misparsed is likely misparsed');
ok(isLikelyMisparsed('Gilbert Daniel R.'));
ok(!isLikelyMisparsed('Smith, Daniel R.'));
done_testing;
Text-Names-0.46/lib/Text/ 000755 000765 000024 00000000000 13033766307 016110 5 ustar 00dbourget staff 000000 000000 Text-Names-0.46/lib/Text/Names.pm 000644 000765 000024 00001030007 13033766144 017511 0 ustar 00dbourget staff 000000 000000 package Text::Names;
use 5.0;
use strict;
use warnings;
use Text::Capitalize qw(capitalize_title @exceptions);
use Text::LevenshteinXS qw(distance);
use Unicode::Normalize;
use utf8;
require Exporter;
our @ISA = qw(Exporter);
our %EXPORT_TAGS = ( 'all' => [ qw(
@NAME_PREFIXES
abbreviationOf
reverseName
cleanParseName
parseName
parseName2
normalizeNameWhitespace
samePerson
sameAuthors
parseNames
parseNameList
cleanNames
cleanName
weakenings
composeName
abbreviationOf
setNameAbbreviations
getNameAbbreviations
isCommonSurname
isCommonFirstname
guessGender
firstnamePrevalence
surnamePrevalence
isMisparsed
isLikelyMisparsed
) ] );
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
our @EXPORT = ();
our $VERSION = '0.46';
#
# Follows a ton of data this module uses and makes available
# Search for 'CODE' (all caps) to skip ahead to the code
#
our @NAME_PREFIXES = qw(de di du da le la van von der den des ten ter y e);
@Text::Capitalize::exceptions = qw(
a an the as s
on is its für à les des au aux o y
and or nor for but so yet
to of by at for but in with has
quot amp
);
push @Text::Capitalize::exceptions, @NAME_PREFIXES;
my $APOST = "(?:’|')";
$Text::Capitalize::word_rule = qr{ ([^\w\s]*) # $1 - leading punctuation
# (e.g. ellipsis, leading apostrophe)
([\w']*) # $2 - the word itself (includes non-leading apostrophes AND HTML ENTITIES)
([^\w\s]*) # $3 - trailing punctuation
# (e.g. comma, ellipsis, period)
(\s*) # $4 - trailing whitespace
# (usually " ", though at EOL prob "")
}x ;
our @ABBREVIATIONS = (
'Ab' => 'Abner',
'Abbie' => 'Abigail',
'Abby' => 'Abigail',
'Abe' => 'Abel',
'Abe' => 'Abraham',
'Abe' => 'Abram',
'Acer' => 'Acera',
'Ada' => 'Adeline',
'Addie' => 'Adelaide',
'Ag' => 'Agatha',
'Aggy' => 'Agatha',
'Agnes' => 'Agatha',
'Agnes' => 'Inez',
'Al' => 'Albert',
'Al' => 'Alexander',
'Al' => 'Alfred',
'Alec' => 'Alexander',
'Alex' => 'Alexander',
'Alf' => 'Alfred',
'Amy' => 'Amanda',
'Amy' => 'Amelia',
'Andy' => 'Andreas',
'Andy' => 'Andrew',
'Angie' => 'Angeline',
'Ann' => 'Deanne',
'Ann' => 'Hannah',
'Ann' => 'Susanna',
'Anna' => 'Hannah',
'Anna' => 'Susanna',
'Anne' => 'Hannah',
'Anne' => 'Susanna',
'Annette' => 'Ann',
'Annette' => 'Anna',
'Annie' => 'Ann',
'Annie' => 'Anna',
'Annie' => 'Hannah',
'Annie' => 'Susanna',
'Appy' => 'Apollonia',
'Archy' => 'Archibald',
'Arnie' => 'Arnold',
'Arny' => 'Arnold',
'Art' => 'Artemis',
'Art' => 'Arthur',
'Arty' => 'Arthur',
'Bab' => 'Barbara',
'Babs' => 'Barbara',
'Barb' => 'Barbara',
'Barney' => 'Barnabas',
'Bart' => 'Bartholomew',
'Barty' => 'Bartholomew',
'Bass' => 'Sebastian',
'Bea' => 'Beatta',
'Bea' => 'Beatrice',
'Beattie' => 'Beatrice',
'Becky' => 'Rebecca',
'Bella' => 'Arabella',
'Bella' => 'Belinda',
'Bella' => 'Elizabeth',
'Bella' => 'Isabel',
'Bella' => 'Isabella',
'Bella' => 'Mirabel',
'Belle' => 'Mabel',
'Belle' => 'Sybil',
'Ben' => 'Benedict',
'Ben' => 'Benjamin',
'Bert' => 'Delbert',
'Bert' => 'Egbert',
'Bertie' => 'Albert',
'Bertie' => 'Gilbert',
'Bess' => 'Elizabeth',
'Bessie' => 'Elizabeth',
'Beth' => 'Elizabeth',
'Beto' => 'Alberto',
'Betsy' => 'Elizabeth',
'Betty' => 'Elizabeth',
'Bev' => 'Beverly',
'Bill' => 'William',
'Bob' => 'Robert',
'Burt' => 'Egbert',
'Cal' => 'Caleb',
'Cal' => 'Calvin',
'Carol' => 'Caroline',
'Cassie' => 'Cassandra',
'Cathy' => 'Catherine',
'Caty' => 'Catherine',
'Cecily' => 'Cecilia',
'Charlie' => 'Charles',
'Chet' => 'Chester',
'Chris' => 'Christian',
'Chris' => 'Christine',
'Chris' => 'Crystal',
'Chuck' => 'Charles',
'Cindy' => 'Cynthia',
'Cindy' => 'Lucinda',
'Cissy' => 'Cecilia',
'Cissy' => 'Clarissa',
'Claus' => 'Nicholas',
'Cleat' => 'Cleatus',
'Clem' => 'Clement',
'Clem' => 'Clementine',
'Cliff' => 'Clifford',
'Cliff' => 'Clifton',
'Clo' => 'Chloe',
'Connie' => 'Constance',
'Connie' => 'Cornelia',
'Conny' => 'Cornelia',
'Cora' => 'Corinne',
'Corky' => 'Courtney',
'Cory' => 'Cornelius',
'Creasey' => 'Lucretia',
'Crissy' => 'Christina',
'Crissy' => 'Christine',
'Cy' => 'Cyrus',
'Cyndi' => 'Cynthia',
'Daisy' => 'Margaret',
'Dan' => 'Daniel',
'Danny' => 'Daniel',
'Dave' => 'David',
'Davy' => 'David',
'Deb' => 'Deborah',
'Debby' => 'Deborah',
'Dee' => 'Deanne',
'Deedee' => 'Diedre',
'Delia' => 'Bridget',
'Delia' => 'Cordelia',
'Delia' => 'Fidelia',
'Della' => 'Delilah',
'Derick' => 'Frederick',
'Di' => 'Diana',
'Di' => 'Diane',
'Dicey' => 'Edith',
'Dicey' => 'Elizabeth',
'Dicey' => 'Eurydice',
'Dick' => 'Richard',
'Didi' => 'Diana',
'Didi' => 'Diane',
'Doc' => 'name given to 7th child',
'Doctor' => 'name given to 7th child',
'Dodie' => 'Delores',
'Dolly' => 'Dorothy',
'Dolly' => 'Margaret',
'Dolly' => 'Martha',
'Dora' => 'Dorothy',
'Dora' => 'Eudora',
'Dora' => 'Isadora',
'Dotty' => 'Dorothy',
'Doug' => 'Douglas',
'Drew' => 'Andrew',
'Eck' => 'Alexander',
'Ed' => 'Edmund',
'Ed' => 'Edward',
'Edie' => 'Edith',
'Effie' => 'Euphemia',
'Elaine' => 'Eleanor',
'Eli' => 'Elijah',
'Eli' => 'Elisha',
'Ella' => 'Eleanor',
'Ella' => 'Gabriella',
'Ella' => 'Luella',
'Ellen' => 'Eleanor',
'Ellie' => 'Danielle',
'Ellie' => 'Eleanor',
'Ellie' => 'Emily',
'Ellie' => 'Gabriella',
'Ellie' => 'Luella',
'Elly' => 'Eleanor',
'Eloise' => 'Heloise',
'Elsie' => 'Elizabeth',
'Emily' => 'Emeline',
'Emma' => 'Emily',
'Eph' => 'Ephraim',
'Erma' => 'Emily',
'Erna' => 'Earnestine',
'Ernie' => 'Earnest',
'Ernie' => 'Earnestine',
'Etta' => 'Loretta',
'Ev' => 'Evangeline',
'Ev' => 'Evelyn',
'Eve' => 'Evelyn',
'Evie' => 'Evelyn',
'Fan' => 'Frances',
'Fanny' => 'Frances',
'Fanny' => 'Veronica',
'Fay' => 'Faith',
'Fina' => 'Josephine',
'Flo' => 'Florence',
'Flora' => 'Florence',
'Flossie' => 'Florence',
'Fran' => 'Frances',
'Frances' => 'Franziska',
'Frank' => 'Francis',
'Frank' => 'Franklin',
'Frankie' => 'Frances',
'Fred' => 'Ferdinand',
'Fred' => 'Frederick',
'Fred' => 'Gottfried',
'Freddie' => 'Frederick',
'Fritz' => 'Frederick',
'Gab' => 'Gabriel',
'Gabby' => 'Gabrielle',
'Gabe' => 'Gabriel',
'Gene' => 'Eugene',
'Genny' => 'Gwenevere',
'Geoff' => 'Geoffrey',
'Gerry' => 'Gerald',
'Gus' => 'Augustus',
'Gus' => 'Gustaf',
'Ham' => 'Hamilton',
'Hank' => 'Henry',
'Hanna' => 'Johanna',
'Hans' => 'Johan',
'Hans' => 'Johannes',
'Harry' => 'Henry',
'Harry' => 'Horace',
'Helen' => 'Eleanor',
'Hester' => 'Esther',
'Ibby' => 'Elizabeth',
'Iggy' => 'Ignatius',
'Issy' => 'Isabella',
'Issy' => 'Isadora',
'Jack' => 'John',
'Jack' => 'Jackson',
'Jackie' => 'Jacqueline',
'Jake' => 'Jacob',
'Jan' => 'Jennifer',
'Jane' => 'Janet',
'Jane' => 'Virginia',
'Jed' => 'Jedediah',
'Jeff' => 'Jeffrey',
'Jennifer' => 'Winifred',
'Jenny' => 'Jennifer',
'Jeremy' => 'Jeremiah',
'Jerry' => 'Jeremiah',
'Jill' => 'Julia',
'Jim' => 'James',
'Jimmy' => 'James',
'Joe' => 'Joseph',
'Joey' => 'Joseph',
'Johnny' => 'John',
'Jon' => 'Jonathan',
'Josh' => 'Joshua',
'Josie' => 'Josephine',
'Joy' => 'Joyce',
'Judy' => 'Judith',
'Kate' => 'Catherine',
'Kathy' => 'Katherine',
'Kathy' => 'Kathlene',
'Katie' => 'Katherine',
'Kissy' => 'Calista',
'Kit' => 'Christopher',
'Kitty' => 'Catherine',
'Klaus' => 'Nicholas',
'Lana' => 'Eleanor',
'Len' => 'Leonard',
'Lena' => 'Magdalena',
'Leno' => 'Felipe',
'Lenora' => 'Eleanor',
'Leo' => 'Leonard',
'Leon' => 'Leonard',
'Lettie' => 'Letitia',
'Lew' => 'Lewis',
'Libby' => 'Elizabeth',
'Lila' => 'Delilah',
'Lisa' => 'Elisa',
'Liz' => 'Elizabeth',
'Liza' => 'Elizabeth',
'Lizzie' => 'Elizabeth',
'Lola' => 'Delores',
'Lorrie' => 'Lorraine',
'Lottie' => 'Charlotte',
'Lou' => 'Louis',
'Louie' => 'Louis',
'Lucy' => 'Lucille',
'Lucy' => 'Lucinda',
'Mabel' => 'Mehitable',
'Maddie' => 'Madeline',
'Maddy' => 'Madeline',
'Madge' => 'Margaret',
'Maggie' => 'Margaret',
'Maggy' => 'Margaret',
'Mame' => 'Margaret',
'Mame' => 'Mary',
'Mamie' => 'Margaret',
'Mamie' => 'Mary',
'Manda' => 'Amanda',
'Mandy' => 'Amanda',
'Mandy' => 'Samantha',
'Manny' => 'Emanuel',
'Manthy' => 'Samantha',
'Marcy' => 'Marcia',
'Marge' => 'Margaret',
'Marge' => 'Marjorie',
'Margie' => 'Margaret',
'Margie' => 'Marjorie',
'Marty' => 'Martha',
'Marv' => 'Marvin',
'Mat' => 'Mathew',
'Matt' => 'Mathew',
'Matt' => 'Matthias',
'Maud' => 'Magdalene',
'Maud' => 'Matilda',
'Maude' => 'Magdalene',
'Maude' => 'Matilda',
'Maury' => 'Maurice',
'Max' => 'Maximilian',
'Max' => 'Maxwell',
'May' => 'Margaret',
'Meg' => 'Margaret',
'Mel' => 'Melvin',
'Mena' => 'Philomena',
'Merv' => 'Mervin',
'Meta' => 'Margareta',
'Mick' => 'Michael',
'Mickey' => 'Michael',
'Midge' => 'Margaret',
'Mike' => 'Michael',
'Millie' => 'Emeline',
'Milly' => 'Millicent',
'Milt' => 'Milton',
'Mimi' => 'Mary',
'Mimi' => 'Wilhelmina',
'Mina' => 'Wilhelmina',
'Mini' => 'Minerva',
'Minnie' => 'Minerva',
'Mira' => 'Elmira',
'Mira' => 'Mirabel',
'Mischa' => 'Michael',
'Mitch' => 'Mitchell',
'Moll' => 'Martha',
'Moll' => 'Mary',
'Molly' => 'Martha',
'Molly' => 'Mary',
'Mona' => 'Ramona',
'Mort' => 'Mortimer',
'Mort' => 'Morton',
'Morty' => 'Mortimer',
'Morty' => 'Morton',
'Mur' => 'Muriel',
'Myra' => 'Almira',
'Nab' => 'Abel',
'Nabby' => 'Abigail',
'Nacho' => 'Ignacio',
'Nadia' => 'Nadine',
'Nan' => 'Ann',
'Nan' => 'Hannah',
'Nan' => 'Nancy',
'Nana' => 'Ann',
'Nana' => 'Hannah',
'Nana' => 'Nancy',
'Nate' => 'Nathan',
'Nate' => 'Nathaniel',
'Ned' => 'Edmund',
'Ned' => 'Edward',
'Ned' => 'Norton',
'Neely' => 'Cornelia',
'Neil' => 'Cornelius',
'Neil' => 'Edward',
'Nell' => 'Cornelia',
'Nell' => 'Eleanor',
'Nell' => 'Ellen',
'Nell' => 'Helen',
'Nellie' => 'Helen',
'Nelly' => 'Cornelia',
'Nelly' => 'Eleanor',
'Nelly' => 'Helen',
'Nessie' => 'Agnes',
'Nettie' => 'Jeanette',
'Netty' => 'Henrietta',
'Nicie' => 'Eunice',
'Nick' => 'Dominic',
'Nick' => 'Nicholas',
'Nicy' => 'Eunice',
'Nikki' => 'Nicole',
'Nina' => 'Ann',
'Nita' => 'Anita',
'Nita' => 'Juanita',
'Noni' => 'Ione',
'Noni' => 'Nora',
'Noni' => 'Eleanor',
'Noni' => 'Elnora',
'Nora' => 'Eleanor',
'Nora' => 'Elnora',
'Nora' => 'Honora',
'Norm' => 'Norman',
'Obed' => 'Obediah',
'Ollie' => 'Oliver',
'Ora' => 'Aurillia',
'Ora' => 'Corinne',
'Pablo' => 'Paul',
'Pacho' => 'Francisco',
'Paco' => 'Francisco',
'Paddy' => 'Patrick',
'Pam' => 'Pamela',
'Pancho' => 'Francisco',
'Pat' => 'Martha',
'Pat' => 'Matilda',
'Pat' => 'Patricia',
'Pat' => 'Patrick',
'Patsy' => 'Martha',
'Patsy' => 'Matilda',
'Patsy' => 'Patricia',
'Patty' => 'Martha',
'Patty' => 'Matilda',
'Patty' => 'Patricia',
'Peg' => 'Margaret',
'Peggy' => 'Margaret',
'Penny' => 'Penelope',
'Pepa' => 'Josefa',
'Pepe' => 'Jose',
'Percy' => 'Percival',
'Pete' => 'Peter',
'Phelia' => 'Orphelia',
'Phil' => 'Philip',
'Phil' => 'Phillip',
'Polly' => 'Mary',
'Polly' => 'Paula',
'Prissy' => 'Priscilla',
'Prudy' => 'Prudence',
'Quil' => 'Aquilla',
'Quillie' => 'Aquilla',
'Rafe' => 'Raphael',
'Randy' => 'Miranda',
'Randy' => 'Randall',
'Randy' => 'Randolph',
'Rasmus' => 'Erasmus',
'Ray' => 'Raymond',
'Reba' => 'Rebecca',
'Reg' => 'Reginald',
'Reggie' => 'Reginald',
'Rena' => 'Irene',
'Rich' => 'Richard',
'Rick' => 'Eric',
'Rick' => 'Frederick',
'Rick' => 'Garrick',
'Rick' => 'Patrick',
'Rick' => 'Richard',
'Rita' => 'Clarita',
'Rita' => 'Margaret',
'Rita' => 'Margarita',
'Rita' => 'Norita',
'Rob' => 'Robert',
'Rod' => 'Roderick',
'Rod' => 'Rodney',
'Rod' => 'Rodrigo',
'Rodie' => 'Rhoda',
'Ron' => 'Aaron',
'Ron' => 'Reginald',
'Ron' => 'Ronald',
'Ronnie' => 'Veronica',
'Ronny' => 'Ronald',
'Rosie' => 'Rosalind',
'Rosie' => 'Rosemary',
'Rosie' => 'Rosetta',
'Roxy' => 'Roxanne',
'Roy' => 'Leroy',
'Rudy' => 'Rudolph',
'Russ' => 'Russell',
'Sadie' => 'Sally',
'Sadie' => 'Sarah',
'Sal' => 'Sarah',
'Sally' => 'Sarah',
'Sam' => 'Samuel',
'Sandy' => 'Alexander',
'Sandy' => 'Sandra',
'Sene' => 'Asenath',
'Senga' => 'Agnes',
'Senie' => 'Asenath',
'Sherm' => 'Sherman',
'Si' => 'Cyrus',
'Si' => 'Matthias',
'Si' => 'Silas',
'Sibella' => 'Isabella',
'Sid' => 'Sidney',
'Silla' => 'Drusilla',
'Silla' => 'Priscilla',
'Silvie' => 'Silvia',
'Sis' => 'Cecilia',
'Sis' => 'Frances',
'Sissy' => 'Cecilia',
'Sol' => 'Solomon',
'Stacia' => 'Eustacia',
'Stacy' => 'Anastasia',
'Stacy' => 'Eustacia',
'Stan' => 'Stanislas',
'Stan' => 'Stanly',
'Stella' => 'Estella',
'Stella' => 'Esther',
'Steve' => 'Steven',
'Steven' => 'Stephen',
'Stew' => 'Stewart',
'Sue' => 'Susan',
'Sue' => 'Suzanne',
'Sukey' => 'Suzanna',
'Susie' => 'Susan',
'Susie' => 'Suzanne',
'Suzy' => 'Susan',
'Suzy' => 'Suzanne',
'Tad' => 'Edward',
'Tad' => 'Thadeus',
'Ted' => 'Edmund',
'Ted' => 'Edward',
'Ted' => 'Theodore',
'Teddy' => 'Edward',
'Teddy' => 'Theodore',
'Telly' => 'Aristotle',
'Terry' => 'Theresa',
'Terza' => 'Theresa',
'Tess' => 'Elizabeth',
'Tess' => 'Theresa',
'Theo' => 'Theobald',
'Theo' => 'Theodore',
'Tia' => 'Antonia',
'Tibbie' => 'Isabella',
'Tilda' => 'Matilda',
'Tilly' => 'Matilda',
'Tilly' => 'Otilia',
'Tim' => 'Timothy',
'Timmy' => 'Timothy',
'Tina' => 'Albertina',
'Tina' => 'Augustina',
'Tina' => 'Christina',
'Tina' => 'Christine',
'Tina' => 'Earnestine',
'Tina' => 'Justina',
'Tina' => 'Martina',
'Tish' => 'Letitia',
'Toby' => 'Tobias',
'Tom' => 'Thomas',
'Tony' => 'Anthony',
'Tracy' => 'Theresa',
'Trina' => 'Katherina',
'Trixie' => 'Beatrice',
'Trudi' => 'Gertrude',
'Trudy' => 'Gertrude',
'Ursie' => 'Ursula',
'Ursy' => 'Ursula',
'Vangie' => 'Evangeline',
'Vern' => 'Vernon',
'Vi' => 'Viola',
'Vi' => 'Violet',
'Vic' => 'Victor',
'Vicky' => 'Victoria',
'Vin' => 'Galvin',
'Vin' => 'Vincent',
'Vina' => 'Alvina',
'Vina' => 'Lavina',
'Vinny' => 'Vincent',
'Virg' => 'Virgil',
'Virgie' => 'Virginia',
'Viv' => 'Vivian',
'Vonnie' => 'Yvonne',
'Wally' => 'Wallace',
'Wally' => 'Walter',
'Walt' => 'Walter',
'Web' => 'Webster',
'Wendy' => 'Gwendolen',
'Wes' => 'Wesley',
'Will' => 'William',
'Willie' => 'Wilhelmina',
'Willy' => 'William',
'Winn' => 'Edwin',
'Winnie' => 'Edwina',
'Winnie' => 'Winifred',
'Woody' => 'Woodrow',
'Xina' => 'Christina',
'Yost' => 'Josef',
'Zac' => 'Isaac',
'Zach' => 'Zachariah',
'Zak' => 'Isaac',
'Zeb' => 'Zebulon',
'Zed' => 'Zedekiah',
'Zeke' => 'Ezekiel',
'Zena' => 'Albertina',
'Zeph' => 'Zephaniah'
);
my %ABBREVIATIONS;
my %ABBREVIATIONS_LC;
setNameAbbreviations(@ABBREVIATIONS);
my $AND = '(?:\s+(?:and|&|&|with)\s+)';
my $MERE_COMMA = '(?:\s*,\s*)';
my $MERE_SEMI = '(?:\s*(?:;|
|
|<\/p>)\s*)';
my $SEMI_AND = "(?:$MERE_SEMI|$AND)";
my $COMMA_AND = "(?:$MERE_COMMA$AND|$AND|$MERE_COMMA)";
my $SPACE = '(?:\s|\ |\n|\r)';
my @NAME_PREFIXES_RE;
for (my $i=0; $i<=$#NAME_PREFIXES; $i++) {
$NAME_PREFIXES_RE[$i] = '(?:$|^|\W)' . $NAME_PREFIXES[$i] . '(?:$|^|\W)';
}
my $PREFIXES = "(?:" . join('|',@NAME_PREFIXES_RE) . ")";
#
# CODE STARTS HERE
#
sub setNameAbbreviations {
while (my $a = shift @_) {
my $b = shift;
$ABBREVIATIONS{$a} ||= {};
$ABBREVIATIONS{$a}->{$b} = 1;
$ABBREVIATIONS_LC{lc $a} ||= {};
$ABBREVIATIONS_LC{lc $a}->{lc $b} = 1;
}
}
sub getNameAbbreviations {
return \%ABBREVIATIONS;
}
our %commonSurnames;
our %commonMaleFirstnames;
our %commonFemaleFirstnames;
our $namesInitialized = 0;
# These three variables initialized below
my $COMMON_SURNAMES;
my $COMMON_MALE_FIRSTNAMES;
my $COMMON_FEMALE_FIRSTNAMES;
sub firstnamePrevalence {
my $name = uc shift;
prepareCommonNames() unless $namesInitialized;
return (($commonMaleFirstnames{$name} || 0) + ($commonFemaleFirstnames{$name} || 0))/2;
}
sub surnamePrevalence {
my $name = uc shift;
prepareCommonNames() unless $namesInitialized;
return $commonSurnames{$name} || 0;
}
sub isCommonFirstname {
my ($name, $percentLimit) = @_;
$percentLimit ||= 0;
return firstnamePrevalence($name) > $percentLimit;
}
sub isCommonSurname {
my ($name, $percentLimit) = @_;
$percentLimit ||= 0;
return surnamePrevalence($name) > $percentLimit;
}
sub isMisparsed {
my ($name) = @_;
$name = lc $name;
return 1 if $name !~ /\w.*,.*\w/;
for my $prefix ((@NAME_PREFIXES,'dr','dr.','prof','prof.','jr','jr.')) {
#warn "check prefix: $prefix";
return 1 if $name =~ /\b$prefix$/i;
}
return 1 if $name =~ /^\w\b/;
return 0;
}
sub isLikelyMisparsed {
my ($name) = @_;
my @parts = parseName2($name);
for my $p (@parts) {
next unless $p;
my @sub_parts = split(/\s+/,$p);
return 1 if $#sub_parts >= 2; # three or more subparts likely a mess
}
# also likely misparsed if firstname containg a likely lastname AND surname contains a likely firstname
my @firstname_parts = split(/\s+/, $parts[0]);
my @first_surname = grep { isCommonSurname($_) } @firstname_parts;
my @surname_parts = split(/\s/, $parts[2]);
my @surname_first = grep { isCommonFirstname($_) } @surname_parts;
return 1 if $#first_surname > -1 and $#surname_first > -1;
return isMisparsed($name);
}
my $fem_ending = qr/(ette|ne|a)$/i;
sub guessGender {
my ($name) = @_;
$name = uc $name;
prepareCommonNames() unless $namesInitialized;
# extract firstname part if necessary
if ($name =~ /[,\s]/) {
my @parts = parseName($name);
$name = $parts[0];
}
my $fscore = $commonFemaleFirstnames{$name};
my $mscore = $commonMaleFirstnames{$name};
return undef if !($fscore || $mscore);
return 'F' if $fscore and !$mscore;
return 'M' if $mscore and !$fscore;
#warn "M: $mscore vs F: $fscore";
my $threshold = 20;
# now the name exist in both. we make a decision if the percentage is very different
# now in both
return 'F' if $fscore / $mscore >= $threshold;
return 'M' if $mscore / $fscore >= $threshold;
# adjust threashold based on common female name endings
my $fem_end = ($name =~ $fem_ending);
if ($fem_end) {
return 'F' if $fscore / $mscore >= ($threshold/4);
} else {
return 'M' if $mscore / $fscore >= ($threshold/4);
}
return undef;
}
sub prepareCommonNames {
for my $n (split(/\n/,$COMMON_SURNAMES)) {
next unless $n =~ /^(.+?)\s+(.+?)\s/;
$commonSurnames{uc $1} = $2;
}
for my $n (split(/\n/,$COMMON_MALE_FIRSTNAMES)) {
next unless $n =~ /^(.+?)\s+(.+?)\s/;
$commonMaleFirstnames{$1} = $2;
}
for my $n (split(/\n/,$COMMON_FEMALE_FIRSTNAMES)) {
next unless $n =~ /^(.+?)\s+(.+?)\s/;
$commonFemaleFirstnames{$1} = $2;
}
$namesInitialized = 1;
}
sub reverseName {
my $n = shift();
return undef unless defined($n);
my @n = split(/,\s*/,$n);
return "$n[1] $n[0]";
}
sub composeName {
my ($given,$last) = @_;
my $r = $last;
$r .= ", $given" if $given;
return $r;
}
sub normalizeNameWhitespace {
my $in = shift;
return undef unless defined $in;
#print "in: $in\n";
# this used to be optional, but then we never know in advance
#my $initialsCanBeLowerCase = shift;
#if ($initialsCanBeLowerCase) {
$in =~ s/\b([a-z])\b/uc $1/ge;
#}
$in =~ s/^\s+//g; # remove initial spaces
$in =~ s/\s+$//g; # remove term spaces
$in =~ s/\s+,/,/g; # remove spaces before coma
$in =~ s/,\s*/, /g; # normalize spaces after coma
$in =~ s/\.\s*([A-Z])/. $1/g; # adjust spacing between initials
#print "in: $in\n";
$in =~ s/([A-Z])\.\s([A-Z])\./$1. $2./g;
$in =~ s/\b([A-Z])\b(?![\.'’])/$1./g;
while ($in =~ s/([\.\s][A-Z])(\s|$)/$1.$2/g) {};
$in =~ s/\.\s*([A-Z])/. $1/g; # adjust spacing between initials
#print "normalized: $in\n";
$in;
}
sub parseName {
my $in = shift;
return undef unless defined $in;
#print "-->parseName in: $in\n";
$in =~ s/^\s*and\s+//;
my $jr = ($in =~ s/,?\sJr\.?(\s|$)//i);
$in =~ s/^\s*by\s+//;
$in =~ s/\W*et\.? al\.?\W*//;
$in =~ s/\.\s*$//; # remove . at the end
#print "$in -->";
$in = normalizeNameWhitespace($in);
#print "$in'\n";
# check if we have a case of Lastname I. without comma
if ($in !~ /,/ and $in=~ /^(.*?\s)((?:[A-Z][\-\.\s]{0,2}){1,3})$/) {
#warn "Got a reversed name without comma: $1, $2";
my $init = $2;
my $rest = $1;
#print "\n\nmatched, rest:$rest--$2\n";
# add . as needed
# if ($init !~ /\./) {
$init =~ s/([A-Z])([^.]|$)/$1.$2/g;
$init =~ s/([A-Z])([^.]|$)/$1.$2/g;
# }
$rest =~ s/\s$//;
$in = normalizeNameWhitespace("$rest, $init");
} elsif ($in =~ /^[^,]+\s\w\.?$/) {
#print "case\n";
$in =~ s/^(.+?)\s((?:[A-Z]\.?-?\s?){1,3})$/$1,$2/;
}
#print "now:$in\n";
# standard cases
if ($in =~ /(.*),\s*(.*)/) {
return ($2, $1);
} else {
my @bits = split(' ',$in);
#print join(" - ", @bits);
if ($#bits == -1) {
return ($in,"");
}
my $lastname = splice(@bits,-1,1);
if ($lastname =~ /^Jr\.?$/i and $#bits > -1) {
$lastname = $bits[-1] . " $lastname";
splice(@bits,-1,1);
}
$lastname = "$lastname Jr" if $jr;
# add prefixes or Jr to lastname
#warn join(" - ",@bits);
while (defined $bits[-1] and $bits[-1] =~ /^$PREFIXES$/i) {
#warn "GOT PREFIX: $bits[-1]";
$lastname = splice(@bits,-1,1) . " $lastname";
}
return (join(' ',@bits),$lastname);
#my $firstname = splice(@bits,0,1);
#while ($#bits > -1 and $bits[0] =~ /^\s*\w\.?\s*$/) {
# $firstname .= " ".splice(@bits,0,1);
#}
#my $lastname = join(' ', @bits);
#return ($firstname, join(' ',@bits));
=crap
my @surnames = $#bits > 1 ? @bits[1,$#bits] : ($bits[1]);
#warn "doing " . join(" ",@bits);
#while ($#bits > 0) {
# @surnames = pop @bits;
#}
#my $surname = pop @bits;
return ($bits[0], join(' ',@surnames));
=cut
}
}
sub parseNames {
my $in = shift;
my $reverse = shift; # means names are stupidly written like this: David, Bourget
return undef if !defined $in;
while($in =~ s/(^|\W)(dr|prof\.? em\.?|prof|profdr|prof|sir|mrs|ms|mr)\.?(\W)/$1 $3/gi) {}
$in =~ s/^\s+//;
$in =~ s/([^A-Z]{2,2})\.\s*/$1/; # remove . at the end
$in =~ s/\(.+\)\s*$//; # remove trailing parens
$in =~ s/(,\s*)?\d\d\d\d-$//;
$in =~ s/^\s*[bB]y(\W)/$1/; #remove "By ";
$in =~ s/,?\s*et\.? al\.?\s*$//; # et al
$in =~ s/^\W+//;
#print "== $in\n";
# semi-colon separated
if ($in =~ /;/) {
return parseNameList(split(/$SEMI_AND/i,$in),$reverse);
}
# no comma and no semi-colon, so one or two not-reversed names
elsif ($in !~ /,/) {
return parseNameList(split(/$AND/i,$in),$reverse);
}
# now that's messy: one or more commas, no semi
else {
# is there a "and"?
#print "$in\n";
if ($in =~ /$AND/i) {
#print "AND:$in\n";
# now we check for double duty for commas
# We fix what would be missing commas on this hypothesis
my $t = $in;
$t =~ s/([a-z])\s+([A-Z])(\.|\s|$)/$1, $2$3/g;
# we check if it's a silly case of commas playing double duty
if ($t =~ /,.+,.+,.+$AND/) {
#print "SILLY: $t\n";
my @to;
my @tokens = split(/$COMMA_AND/i,$t);
for (my $ti=0; $ti <= $#tokens;$ti+=2) {
push @to, join(", ",@tokens[$ti..$ti+1]);
}
return parseNameList(@to,$reverse);
}
# no silliness. what's after the AND will tell us the format
# if there's a comma after, it's probably reversed
if ($in =~ /$AND.*,/i) {
return parseNameList(split(/$SEMI_AND/i,$in),$reverse);
}
else {
my @parts = split(/$COMMA_AND/i,$in);
return parseNameList(@parts,$reverse);
}
} else {
#print "- no and\n";
# no semi, no and, and one or more comma
# if 2 or more commas
if ($in =~ /,.+,/) {
# need to check if this is a silly case of commas with reversed names
# check that by looking for two or more ,token, with only one part, and odd number of ,
my @tokens = split(/$MERE_COMMA/i,$in);
my $silly = 0;
for my $tok (@tokens) {
$silly++ unless $tok =~ m/[\w\.]$SPACE[\w\.]/i;
}
# if silly combination, every other comma separates two names
if ($silly >=2 and $#tokens %2 ==1) {
my @to;
for (my $ti=0; $ti <= $#tokens;$ti+=2) {
push @to, join(", ",@tokens[$ti..$ti+1]);
}
@tokens = @to;
}
return parseNameList(@tokens,$reverse);
}
# else, one comma, no semi, and no and
else {
# now that's ambiguous between "Doe, John" and "John Doe, John Doe"
# but we assume there are no names like "Herrera Abreu, Maria Teresa"
# (which there are, this is a real one). that is, if the comma separates
# two tokens on each side (not counting de,di,von, etc.), we suppose
# these tokens make distinct names
my @toks = split(/,/,$in);
my @copy = @toks;
foreach (@copy) {
s/$PREFIXES|(\WJr(\W|$))/ /ig;
my @bits = split(' ',$_);
if ($#bits <= 0) {
# found one side with only one non-trivial token
# so there is only one author in $in
return parseNameList(($in),$reverse);
}
}
return parseNameList(@toks,$reverse);
}
}
}
return ();
}
sub parseNameList {
my @auths;
#print "Got: " . join("---", @auths) . "\n";
my $reverse;
if (defined($_[-1]) and $_[-1] eq 'reverse') {
pop @_;
$reverse = 1;
}
# first we correct for overly split names like 'Bourget; D; John Doe'
my @new;
#print Dumper(\@_); use Data::Dumper;
for (@_) {
# if the part looks like an initial, we add it to the previous name part
next unless $_;
if (/^([A-Z](\.|\s|$)\s?)+$/ and $#new > -1) {
if ($new[-1] =~ /,/) {
$new[-1] = "$new[-1] $_";
} else {
$new[-1] = "$new[-1], $_";
}
} else {
push @new, $_;
}
}
foreach my $a (@new) {
next unless $a;
my ($f,$l) = parseName($a);
push @auths, ($reverse ? "$f, $l" : "$l, $f");
}
return @auths;
}
sub parseName2 {
my $in = shift;
my ($i,$s);
return ("","") unless defined $in;
my ($l,$f) = split(/,\s*/,$in);
$f ||= '';
$l ||= '';
# get suffix
if ($l =~ s/\s+(Jr\.?|[IV]{2,10})\s*$//) {
$s = $1;
}
#print "f: $f\nl:$l\n";
# separate firstname/initial
# if has only initials
if ($f =~ /^\s*([A-Z](?:\.|\s|$))(.+)$/) {
$f = $1;
$i = $2;
$i =~ s/^\s*//;
}
# has a full firstname
else {
if ($f =~ /^([^\s]+?)\s+((?:[A-Z](?:\.|\s+|$)\s*)+)$/) {
$f = $1;
$i = $2;
}
}
return ($f,$i,$l,$s);
}
sub abbreviationOf {
my ($a,$b) = @_;
return 0 unless $ABBREVIATIONS_LC{lc $a};
return $ABBREVIATIONS_LC{lc $a}->{lc $b} ? 1 : 0;
}
# if the two names passed as params are such that they could belong to the same person, returns a merged name
sub samePerson {
my ($a,$b, %opts) = @_; #name1,name2
return undef if !defined($a) or !defined($b);
my $a_expd = 0;
my $b_expd = 0;
my ($lasta,$firsta) = split(',',cleanName($a,' ','reparse'));
my ($lastb,$firstb) = split(',',cleanName($b,' ','reparse'));
return undef if defined($firsta) and !defined($firstb);
return undef if defined($firstb) and !defined($firsta);
return undef if defined($lasta) and !defined($lastb);
return undef if defined($lastb) and !defined($lasta);
#print "here '$lasta'-'$lastb'\n";
$lasta =~ s/\s+Jr\.?$// if defined $lasta;
$lastb =~ s/\s+Jr\.?$// if defined $lastb;
# check for reversed name if loose
if (!equivtext($lasta,$lastb)) {
if (!$opts{loose}) {
return undef;
} else {
return samePerson("$firsta, $lasta", "$lastb, $firstb", loose=>0) || samePerson("$firsta $lasta","$firstb $lastb", loose=>0);
}
}
=old
# regimentation
$firsta =~ s/\./ /g;
$firstb =~ s/\./ /g;
$firsta =~ s/\s+/ /g;
$firstb =~ s/\s+/ /g;
=cut
my @at = split(" ",$firsta || '');
my @bt = split(" ",$firstb || '');
#print "AT: " . join("-",@at) . "\n";
#print "BT: " . join("-",@bt) . "\n";
# compare each token pair as follows:
# if reached the end of tokens on either side, compat
# if both are greater than 1 char and diff, not compat
# if they don't start by the same letter, not compat
# else merge the tokens, compat so far, move on to next token pair
#
my $merged = "$lasta,";
for (my $i=0; $i <= $#at || $i <= $#bt; $i++) {
#print "$i ($merged):" . $at[$i] . "-" . $bt[$i]. "-\n";
# end of tokens reached on one side
if ($i > $#at) {
#print "END ($merged)\n";
#return undef if $b_expd;
$merged .= " ". join(" ",@bt[$i..$#bt]);
return cleanName($merged,'');
} elsif ($i > $#bt) {
#print "END ($merged)\n";
#return undef if $a_expd;
$merged .= " ". join(" ",@at[$i..$#at]);
return cleanName($merged,'');
}
# if different tokens
if (!equivtext($at[$i],$bt[$i])) {
# if different first letters, not compat
return undef if !equivtext(substr($at[$i],0,1),substr($bt[$i],0,1));
# otherwise they might be compatible
# token a is full word
if (length($at[$i]) > 2) {
# b is too, they are not compat unless one is a short for the other
if (length($bt[$i]) > 2) {
if ( abbreviationOf($at[$i],$bt[$i]) ) {
$merged .= " " . $bt[$i];
next;
} elsif ( abbreviationOf($bt[$i],$at[$i]) ) {
$merged .= " " . $at[$i];
next;
} else {
return undef;
}
}
# b is initial, they are compat so far
else {
$b_expd = 1;
$merged .= " " . $at[$i];
}
# a is initial
} else {
# b is full word
$a_expd = 1 if length($bt[$i]) > 2;
# keep going
$merged .= " " . $bt[$i];
}
}
# otherwise move on to next token pair straight
else {
$merged .= " " .$at[$i];
}
}
# if we get there, the two names are compatible and $merged contains the richest name from the two
# print "merged: $merged\n";
return cleanName($merged,'');
}
sub equivtext {
my ($a,$b) = @_;
$a = lc rmDiacritics($a);
$b = lc rmDiacritics($b);
$a =~ s/\.\s*$//;
$b =~ s/\.\s*$//;
#warn "$a == $b == " . distance($a,$b);
# we allow one character difference, except for initial tokens. this is because some diacritics change the underlying letter when removed, e.g. Björklund -> Bjarklund, even though one might naturally write 'Bjorklund'
return (length($a) > 1 && length($b) > 1) ? distance($a,$b) <= 1 : $a eq $b;
}
sub cleanParseName {
my $n = shift;
# I think that one is overkill..
return parseName(cleanName(composeName(parseName($n))));
}
sub cleanName {
my ($n, $space, $reparse) = @_;
# Some of the cleaning-up here is redundant because also in parseName, which is called last. But it doesn't hurt.. If it works don't try and fix it.
return undef unless defined $n;
#print "Cleaning name: $n\n";
$n =~ s/\.( \.)+/./g;
$n =~ s/\.($APOST)/'/g;
# if ", john doe"
if ($n =~ s/^\s*,\s+//) { }
# if 'john doe,'
if ($n =~ s/^([^,]+?)\s*,\s*$/$1/) { }
$n =~ s/Get checked abstract//g;
$n = rmTags($n);
$n =~ s/, By$//i;
#if ($reparse and $n =~ s/,/) {
# my ($l,$f) = split(/,\s*/,$n);
# my ($f,$l) = parseName(join(' ',($f,$l)));
# $n = "$l, $f";
#}
# Fix for O'Something => O.'Something
#$n =~ s/O\.'/O'/;
$n =~ s/[\r\n]/ /gsm;
$n =~ s/(\w)\s*,/$1,/g;
$n =~ s/([a-z]{2,})\./$1/gi; #remove unwanted .
$n =~ s/(\W|\.|\s)([A-Z])(\s|$)/$1$2.$3/g; #add . to initials
$n =~ s/(\W|\.|\s)([A-Z])(\s|$)/$1$2.$3/g; #add . to initials (repeat for overlaps)
$n =~ s/\.\s*([A-Z])/". " . uc $1/ieg; # adjust spacing between initials
$n =~ s/\W*\d{4,4}\W*//g; # misplaced dates
$n =~ s/\(.*$//; #parentheses
# misplaced jr
$n =~ s/([\w'-])\s*,(.*)\sJr(\s.*|$)/$1 Jr,$2 $3/i;
# misplaced prefixe
#warn $n;
#$n =~ s/([\w'-])\s*,(.*)\s(van|von|von\sder|van\sder|di|de|del|du|da)(\s.*|$)/(lc $3) . $1 . "," . $2 . $4/ie;
#warn $n;
# replace Iep by UNKNOWN (for PP)
$n =~ s/^Iep,$/Unknown, Unknown/;
#links aren't names
$n = "Unknown, Unknown" if $n =~ /http:\/\//;
# capitalize if nocaps
if ($n !~ /[A-Z]/) {
$n = capitalize($n,notSentence=>1);#_title($n, PRESERVE_ANYCAPS=>1, NOT_CAPITALIZED=>\@PREFIXES);
}
# do we have initials stuck on the surname like so: RawlsJ.
unless ($n =~ /,/ or $n =~ /\w \w/) {
$n =~ s/([A-Z][a-z]{1,})((?:[A-Z](?:$|\.|\s|)\s*)+)\s*$/$1, $2/g;
}
#warn "$n";
#unless it's all caps, the caps are initials. we unstuck them and add .
if ($n =~ /[a-z]/ and $n !~ /[A-Z]{2,} [A-Z]{2,}/) {
$n =~ s/(\s|^)([A-Z]{2,3})(\.|\s|$)/$1 . toInitials($2) . $3/ge;
} else {
}
#warn $n;
my ($f,$l) = parseName($n);
#warn "** $l, $f";
#warn "$l, $f";
$n = composeName($f,$l);
# now final capitalization
$n = capitalize($n,notSentence=>1); #,PRESERVE_ANYCAPS=>1, NOT_CAPITALIZED=>\@PREFIXES);
return $n;
}
sub toInitials {
my $s = shift;
#warn "init: $s";
return $s if grep { lc $_ eq lc $s } @NAME_PREFIXES;
$s =~ s/^([A-Z])([A-Z])([A-Z])$/$1. $2. $3./;
$s =~ s/^([A-Z])([A-Z])$/$1. $2./;
return $s;
}
sub weakenings {
my( $firstname, $lastname ) = @_;
my @warnings;
# default firstname aliases: every middle name can be either in full, initialized, or absent
my @first_parts = split(/\s+/,normalizeNameWhitespace($firstname));
my $reduced = 0;
if( scalar @first_parts > 3 ){
$reduced = 1;
splice( @first_parts, 3 );
push @warnings, "Too many parts in first name: '$firstname'\n";
}
my $first = shift @first_parts;
for my $i (0..$#first_parts) {
my $value = $first_parts[$i];
$first_parts[$i] = [$value]; # the default value is good
# try downgrading to initial
push @{$first_parts[$i]}, $value if ($value =~ s/(\w)[^\s\.]+/$1./);
}
my @first_aliases = ( $first );
push @first_aliases, "$1." if $first =~ /(\w)[^\s\.]+/;
#print Dumper(\@first_parts);
for my $i (0..$#first_parts) {
my @new;
for my $current (@first_aliases) {
for (@{$first_parts[$i]}) {
push @new, "$current $_";
}
push @new, $current;
}
@first_aliases = @new;
}
#print Dumper(\@first_aliases);
$lastname = normalizeNameWhitespace($lastname);
my @prefixes = map "\\b$_\\b", @NAME_PREFIXES, 'y', 'los';
my $prefixes = join '|', @prefixes;
$lastname =~ s/($prefixes) /$1+/ig;
my @parts = reverse ( ( split(/\s+/,$lastname) ) );
my @last_aliases;
my $lastlast = shift @parts;
for my $variation ( variations( $lastlast ) ){
push @last_aliases, $variation;
}
if( scalar @parts < 3 ){
for my $lpart ( @parts ){
my @curr = @last_aliases;
for my $variation ( variations( $lpart ) ){
for my $alias ( @curr ){
next if $variation =~ /-/ && $alias =~ / /;
next if $variation =~ / / && $alias =~ /-/;
push @last_aliases, "$variation $alias" if $variation !~ /-/ && $alias !~ /-/;
push @last_aliases, "$variation-$alias" if $variation !~ / / && $alias !~ / /;
}
}
}
}
else{
push @warnings, "Too many parts in last name: '$lastname'\n";
push @last_aliases, $lastname;
}
my @aliases;
unshift @first_aliases, $firstname if $reduced;
ALIAS:
for my $f ( @first_aliases ) {
for my $l (@last_aliases) {
push @aliases, { firstname => $f, lastname => $l };
if( scalar @aliases > 25 ){
push @warnings, 'More than 25 aliases';
last ALIAS;
}
}
}
return \@warnings, @aliases;
}
sub variations {
my $part = shift;
my @parts = split /\+/, $part;
if( scalar @parts <= 1 ){
return $part;
}
else{
return join( ' ', @parts ), $parts[-1];
}
}
sub rmTags {
my $in = shift;
while ($in =~ s/(<|(?:\<))\/?[^>]*?(>|(?:\>))/ /g) {};
return $in;
}
sub rmDiacritics {
my $str = shift;
my $nstr = '';
for ( $str ) { # the variable we work on
## convert to Unicode first
## if your data comes in Latin-1, then uncomment:
#$_ = Encode::decode( 'iso-8859-1', $_ );
return "" if !defined $_;
$_ = NFD( $_ ); ## decompose
s/\pM//g; ## strip combining characters
s/[^\0-\x80]//g; ## clear everything else
$nstr .= $_;
}
$nstr;
}
sub rmDiacriticsNOTGOOD {
my $str = shift;
my $nstr = '';
#
# This code (c) Ivan Kurmanov, http://ahinea.com/en/tech/accented-translate.html
#
for ( $str ) { # the variable we work on
## convert to Unicode first
## if your data comes in Latin-1, then uncomment:
#$_ = Encode::decode( 'iso-8859-1', $_ );
s/\xe4/ae/g; ## treat characters ä ñ ö ü ÿ
s/\xf1/ny/g; ## this was wrong in previous version of this doc
s/\xf6/oe/g;
s/\xfc/ue/g;
s/\xff/yu/g;
$_ = NFD( $_ ); ## decompose (Unicode Normalization Form D)
s/\pM//g; ## strip combining characters
# additional normalizations:
s/\x{00df}/ss/g; ## German beta â<80><9c>Ã<9f>â<80><9d> -> â<80><9c>ssâ<80><9d>
s/\x{00c6}/AE/g; ## Ã<86>
s/\x{00e6}/ae/g; ## æ
s/\x{0132}/IJ/g; ## IJ
s/\x{0133}/ij/g; ## ij
s/\x{0152}/Oe/g; ## Å<92>
s/\x{0153}/oe/g; ## Å<93>
tr/\x{00d0}\x{0110}\x{00f0}\x{0111}\x{0126}\x{0127}/DDddHh/; # Ã<90>Ä<90>ðÄ<91>Ħħ
tr/\x{0131}\x{0138}\x{013f}\x{0141}\x{0140}\x{0142}/ikLLll/; # ıĸĿÅ<81>Å<80>Å<82>
tr/\x{014a}\x{0149}\x{014b}\x{00d8}\x{00f8}\x{017f}/NnnOos/; # Å<8a>Å<89>Å<8b>Ã<98>øſ
tr/\x{00de}\x{0166}\x{00fe}\x{0167}/TTtt/; # Ã<9e>Ŧþŧ
s/[^\0-\x80]/ /g; ## space for everything else; optional
$nstr .= $_;
}
$nstr;
}
sub capitalize {
my $txt = shift;
my %args = @_;
#print "bef: $txt\n";
# we don't want to recapitalize when it look ok
# what doesn't look ok is a token with all lowercase (4 or more chars) or allcaps (2 or more chars)
return $txt unless $txt =~ /\b[A-Z]{2,}\b/ or $txt =~ /\b[a-z]{4,}\b/;
my $t = capitalize_title($txt);
if ($args{notSentence}) {
$t =~ s/^($PREFIXES)/lc $1/ie;
}
#fix for bug in text::capitalize
$t =~ s/&Quot;?(\.?)$/"$1/g;
return $t;
}
#
# Top 1000 most common male first names in the US according to the 1990 US survey, with relative frequency (%, first column)
#
$COMMON_MALE_FIRSTNAMES = <