Text-Names-0.46/000755 000765 000024 00000000000 13033766307 014416 5ustar00dbourgetstaff000000 000000 Text-Names-0.46/Changes000644 000765 000024 00000001542 12675336670 015722 0ustar00dbourgetstaff000000 000000 Revision history for Perl module Text::Names 0.01 2011-02-07 - original version; created by h2xs 1.23 with options -AX Text::Names 0.02 2011-02-08 - Fixed dependency issues 0.03 2011-02-08 - Improved doc 0.19 2012-04-25 - Tons of improvements 0.20 2013-03-11 - Removed annoying uninitialized variable warnings 0.30 2013-08-09 - Added guessGender subroutine 0.31 2013-08-10 - Tweaks to guessGender subroutine 0.32 2013-08-10 - Documentation 0.33 2013-08-10 - Fixed Changes file 0.39 2014-03-09 - Fixed issues with reversed names without commas 0.40 2014-04-05 - Fixed issues with O'* names 0.41 2015-01-05 - Added firstnamePrevalence() and surnamePrevalence() 0.42 2015-04-01 - Merged doc fixes from ugexe 0.43 2015-05-15 - Make guessGender work with full names; improved guessGender recall Text-Names-0.46/lib/000755 000765 000024 00000000000 13033766307 015164 5ustar00dbourgetstaff000000 000000 Text-Names-0.46/Makefile.PL000644 000765 000024 00000001535 12675336670 016403 0ustar00dbourgetstaff000000 000000 use 5.0; use ExtUtils::MakeMaker; # See lib/ExtUtils/MakeMaker.pm for details of how to influence # the contents of the Makefile that is written. WriteMakefile( NAME => 'Text::Names', VERSION_FROM => 'lib/Text/Names.pm', # finds $VERSION PREREQ_PM => { 'Text::Capitalize' => 1.1, 'Text::LevenshteinXS' => 0.03, 'Test::More' => 0.96 }, # e.g., Module::Name => 1.1 ($] >= 5.005 ? ## Add these new keywords supported since 5.005 (ABSTRACT_FROM => 'lib/Text/Names.pm', # retrieve abstract from module AUTHOR => 'David Bourget') : ()), ($ExtUtils::MakeMaker::VERSION < 6.46 ? () : ( META_MERGE => { resources => { repository => 'https://github.com/dbourget/Text-Names', }, }, ) ), ); Text-Names-0.46/MANIFEST000644 000765 000024 00000000472 13033766307 015552 0ustar00dbourgetstaff000000 000000 Changes lib/Text/Names.pm Makefile.PL MANIFEST This list of files README t/01-abbreviations.t t/02-samePerson.t t/03-parseNames.t t/04-common.t META.yml Module YAML meta-data (added by MakeMaker) META.json Module JSON meta-data (added by MakeMaker) Text-Names-0.46/META.json000644 000765 000024 00000002133 13033766307 016036 0ustar00dbourgetstaff000000 000000 { "abstract" : "Perl extension for proper name parsing, normalization, recognition, and classification", "author" : [ "David Bourget" ], "dynamic_config" : 1, "generated_by" : "ExtUtils::MakeMaker version 6.66, CPAN::Meta::Converter version 2.133380", "license" : [ "unknown" ], "meta-spec" : { "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", "version" : "2" }, "name" : "Text-Names", "no_index" : { "directory" : [ "t", "inc" ] }, "prereqs" : { "build" : { "requires" : { "ExtUtils::MakeMaker" : "0" } }, "configure" : { "requires" : { "ExtUtils::MakeMaker" : "0" } }, "runtime" : { "requires" : { "Test::More" : "0.96", "Text::Capitalize" : "1.1", "Text::LevenshteinXS" : "0.03" } } }, "release_status" : "stable", "resources" : { "repository" : { "url" : "https://github.com/dbourget/Text-Names" } }, "version" : "0.46" } Text-Names-0.46/META.yml000644 000765 000024 00000001207 13033766307 015667 0ustar00dbourgetstaff000000 000000 --- abstract: 'Perl extension for proper name parsing, normalization, recognition, and classification' author: - 'David Bourget' build_requires: ExtUtils::MakeMaker: 0 configure_requires: ExtUtils::MakeMaker: 0 dynamic_config: 1 generated_by: 'ExtUtils::MakeMaker version 6.66, CPAN::Meta::Converter version 2.133380' license: unknown meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: 1.4 name: Text-Names no_index: directory: - t - inc requires: Test::More: 0.96 Text::Capitalize: 1.1 Text::LevenshteinXS: 0.03 resources: repository: https://github.com/dbourget/Text-Names version: 0.46 Text-Names-0.46/README000644 000765 000024 00000001122 12675336670 015301 0ustar00dbourgetstaff000000 000000 Text-Names ======================= A Perl extension for name parsing and normalization. See POD for details. INSTALLATION To install this module type the following: perl Makefile.PL make make test make install DEPENDENCIES This module requires these other modules and libraries: Text::Capitalize COPYRIGHT AND LICENCE Copyright (C) 2011-2013 by David Bourget This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.10.1 or, at your option, any later version of Perl 5 you may have available. Text-Names-0.46/t/000755 000765 000024 00000000000 13033766307 014661 5ustar00dbourgetstaff000000 000000 Text-Names-0.46/t/01-abbreviations.t000644 000765 000024 00000000434 12675336670 020124 0ustar00dbourgetstaff000000 000000 use Text::Names qw/abbreviationOf/; use Test::More; ok(abbreviationOf('Dave','David'),'Dave -> David'); ok(abbreviationOf('Mike','Michael'),'Mike -> Michael'); ok(abbreviationOf('Bella','Belinda'),'Bella -> Belinda'); ok(!abbreviationOf('John','Bob'),'John !> Bob'); done_testing; Text-Names-0.46/t/02-samePerson.t000644 000765 000024 00000002250 12675343463 017405 0ustar00dbourgetstaff000000 000000 use Text::Names qw/samePerson/; use Test::More; ok(samePerson('Dave Bourget','David Bourget'),'Dave Bourget -> David Bourget'); ok(samePerson('Dave J. Bourget','David John Bourget'),'Dave J. Bourget -> David John Bourget'); ok(samePerson('D J Bourget','David John Bourget'),'D J Bourget -> David John Bourget'); ok(samePerson('D Bourget','David John Bourget'),'D Bourget -> David John Bourget'); ok(samePerson('Bourget, David', 'Dave Bourget Jr'), 'Bourget, David -> Dave Bourget Jr'); ok(samePerson('Dave Bourget','Bourget, David F.'),'Dave Bourget -> Bourget, David F.'); ok(!samePerson('J Bourget','David John Bourget'),'J Bourget !> David John Bourget'); ok(!samePerson('D F Bourget','David John Bourget'),'D F Bourget !> David John Bourget'); ok(!samePerson('John Doe','David John Bourget'),'John Doe !> David John Bourget'); is( samePerson("Bourget, David J.","Bourget, David"), "Bourget, David J.", "Bourget, David J as return value"); is( samePerson("Bourget, David J.","Bourget, David X."), undef, "Not compatible = undef"); ok(samePerson('Fredrik Björklund','F Bjorklund'),'Björklund, Fredrik'); ok( samePerson("Bourget, David", "David, Bourget", "loose" =>1 ) ); done_testing; Text-Names-0.46/t/03-parseNames.t000644 000765 000024 00000005674 12675336670 017407 0ustar00dbourgetstaff000000 000000 use Text::Names 'parseNames','cleanName','composeName','parseName'; use Test::More; use utf8; binmode(STDOUT,":utf8"); is(cleanName("T.H. Ho"),"Ho, T. H."); is(cleanName("Ho, T. . ."),"Ho, T."); is(cleanName("D.'Arms, D."), "D'Arms, D."); is(cleanName("D.’Arms, D."), "D'Arms, D."); is(cleanName("D’Arms, D."), "D’Arms, D."); my ($f,$l) = parseName("D’Arms, D."); is($l,"D’Arms"); my %tests = ( 'Kuehni, R. G., Hardin, C. L.' => 'Kuehni, R. G.; Hardin, C. L.', 'Bourget, David; Doe, John' => 'Bourget, David; Doe, John', 'David Bourget & John Doe' => 'Bourget, David; Doe, John', 'David Bourget and John Doe' => 'Bourget, David; Doe, John', 'Bourget D, Doe J' => 'Bourget, D.; Doe, J.', 'Bourget DJR' => 'Bourget, D. J. R.', 'Bourget, D.J.R.' => 'Bourget, D. J. R.', 'Bourget D.J.R.' => 'Bourget, D. J. R.', 'D.J. Bourget' => 'Bourget, D. J.', 'Bourget, DAVID' => 'Bourget, DAVID', 'David BOURGET' => 'BOURGET, David', 'David Chalmers, David Bourget and John Doe' => 'Chalmers, David; Bourget, David; Doe, John', 'Chalmers, David, Bourget, David, Doe, John' => 'Chalmers, David; Bourget, David; Doe, John', 'Chalmers, David John, Bourget, David, Doe, John C.' => 'Chalmers, David John; Bourget, David; Doe, John C.', 'DAVID BOURGET' => 'BOURGET, DAVID', 'John Doe Jr' => 'Doe Jr, John', 'John M. Doe Jr' => 'Doe Jr, John M.', 'Dr Afsar Abbas' => 'Abbas, Afsar', 'R. de Sousa' => 'de Sousa, R.', 'Jean Claude van Damme' => 'van Damme, Jean Claude', 'Dr. Jean Claude van Damme, Prof R de Sousa' => 'van Damme, Jean Claude; de Sousa, R.', "Maureen A. O'Malley" => "O'Malley, Maureen A.", "Gusmão da Silva, Guilherme" => "Gusmão da Silva, Guilherme", "D Bourget, Zbigniew Z Lukasiak and John Doe" => "Bourget, D.; Lukasiak, Zbigniew Z.; Doe, John", "Bourget, D and John Doe" => "Bourget, D.; Doe, John", "Bourget, D, Chalmers C, and John Doe" => "Bourget, D.; Chalmers, C.; Doe, John", cleanName("Guilherme Gusmão da Silva") => "da Silva, Guilherme Gusmão", cleanName("van Untouched, Firstname") => "van Untouched, Firstname", cleanName("Van Untouched, Firstname") => "Van Untouched, Firstname", cleanName("VAN TOUCHED, firstname") => "van Touched, Firstname", cleanName("CL Adams") => "Adams, C. L.", cleanName("Hacker, PMS") => "Hacker, P. M. S.", cleanName("van fraassen b") => "van Fraassen, B.", cleanName("van fraassen b c") => "van Fraassen, B. C.", cleanName("RawlsJ.") => "Rawls, J.", cleanName("RawlsJ.C.") => "Rawls, J. C.", cleanName("RawlsJC") => "Rawls, J. C.", cleanName("McKim, John") => "McKim, John", cleanName("John McKim") => "McKim, John", cleanName("McKim") => "McKim, " ); is(cleanName("Hacker, PMS"),"Hacker, P. M. S."); is(cleanName("Doe, Bob"),"Doe, Bob"); #print cleanName("Guilherme Gusmão da Silva"); foreach my $t (keys %tests) { my $r = join('; ',parseNames($t)); is( $r, $tests{$t}, "$t -> $r" ); } done_testing; Text-Names-0.46/t/04-common.t000644 000765 000024 00000003327 13033732023 016550 0ustar00dbourgetstaff000000 000000 use Test::More; use Text::Names qw/isCommonSurname isCommonFirstname guessGender firstnamePrevalence surnamePrevalence isLikelyMisparsed/; ok(isCommonSurname('Smith'),'No threshold test, positive'); ok(isCommonSurname('Kennedy'),'No threshold test, positive'); ok(!isCommonSurname('Sdakljdslkafdjdfsa'),'No threshold test, negative'); ok(isCommonSurname('Smith',0.01),'Threshold test, positive'); ok(!isCommonSurname('Smith',5),'Threshold test, negative'); ok(isCommonFirstname('Mary'),'Firstname no threshold test, positive'); ok(isCommonFirstname('David'),'Firstname no threshold test, positive'); ok(!isCommonFirstname('Aklajldkfsjfd'),'Firstname no threshold test, negative'); ok(isCommonFirstname('Mary',0.5),'Firstname threshold test, positive'); ok(isCommonFirstname('John',1),'Firstname threshold test, positive'); ok(!isCommonFirstname('Mark',3),'Firstname threshold test, negative'); ok(guessGender('David') eq 'M'); ok(guessGender('lkjasdf') == undef); ok(guessGender('Mary') eq 'F'); ok(guessGender('Arthur') eq 'M'); ok(guessGender('William') eq 'M'); is(guessGender('Christian'), 'M'); is(guessGender('Arthur Flintstone'), 'M'); is(guessGender("Christian Loew"),"M"); is(guessGender('Natalia'), 'F'); is(guessGender('Ana'), 'F'); is(guessGender('Eleni'), 'F'); ok(firstnamePrevalence('David') > 1); ok(surnamePrevalence('Smith') > 1); ok(firstnamePrevalence('Angela') > 0); ok(surnamePrevalence('Bourvici') == 0); ok(isLikelyMisparsed('Bourget, David John Richard Bill')); ok(isLikelyMisparsed('Alexia, Smith')); ok(!isLikelyMisparsed('Bourget, David')); ok(isLikelyMisparsed('Bourget, David Dr'), 'Misparsed is likely misparsed'); ok(isLikelyMisparsed('Gilbert Daniel R.')); ok(!isLikelyMisparsed('Smith, Daniel R.')); done_testing; Text-Names-0.46/lib/Text/000755 000765 000024 00000000000 13033766307 016110 5ustar00dbourgetstaff000000 000000 Text-Names-0.46/lib/Text/Names.pm000644 000765 000024 00001030007 13033766144 017511 0ustar00dbourgetstaff000000 000000 package Text::Names; use 5.0; use strict; use warnings; use Text::Capitalize qw(capitalize_title @exceptions); use Text::LevenshteinXS qw(distance); use Unicode::Normalize; use utf8; require Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw( @NAME_PREFIXES abbreviationOf reverseName cleanParseName parseName parseName2 normalizeNameWhitespace samePerson sameAuthors parseNames parseNameList cleanNames cleanName weakenings composeName abbreviationOf setNameAbbreviations getNameAbbreviations isCommonSurname isCommonFirstname guessGender firstnamePrevalence surnamePrevalence isMisparsed isLikelyMisparsed ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our @EXPORT = (); our $VERSION = '0.46'; # # Follows a ton of data this module uses and makes available # Search for 'CODE' (all caps) to skip ahead to the code # our @NAME_PREFIXES = qw(de di du da le la van von der den des ten ter y e); @Text::Capitalize::exceptions = qw( a an the as s on is its für à les des au aux o y and or nor for but so yet to of by at for but in with has quot amp ); push @Text::Capitalize::exceptions, @NAME_PREFIXES; my $APOST = "(?:’|')"; $Text::Capitalize::word_rule = qr{ ([^\w\s]*) # $1 - leading punctuation # (e.g. ellipsis, leading apostrophe) ([\w']*) # $2 - the word itself (includes non-leading apostrophes AND HTML ENTITIES) ([^\w\s]*) # $3 - trailing punctuation # (e.g. comma, ellipsis, period) (\s*) # $4 - trailing whitespace # (usually " ", though at EOL prob "") }x ; our @ABBREVIATIONS = ( 'Ab' => 'Abner', 'Abbie' => 'Abigail', 'Abby' => 'Abigail', 'Abe' => 'Abel', 'Abe' => 'Abraham', 'Abe' => 'Abram', 'Acer' => 'Acera', 'Ada' => 'Adeline', 'Addie' => 'Adelaide', 'Ag' => 'Agatha', 'Aggy' => 'Agatha', 'Agnes' => 'Agatha', 'Agnes' => 'Inez', 'Al' => 'Albert', 'Al' => 'Alexander', 'Al' => 'Alfred', 'Alec' => 'Alexander', 'Alex' => 'Alexander', 'Alf' => 'Alfred', 'Amy' => 'Amanda', 'Amy' => 'Amelia', 'Andy' => 'Andreas', 'Andy' => 'Andrew', 'Angie' => 'Angeline', 'Ann' => 'Deanne', 'Ann' => 'Hannah', 'Ann' => 'Susanna', 'Anna' => 'Hannah', 'Anna' => 'Susanna', 'Anne' => 'Hannah', 'Anne' => 'Susanna', 'Annette' => 'Ann', 'Annette' => 'Anna', 'Annie' => 'Ann', 'Annie' => 'Anna', 'Annie' => 'Hannah', 'Annie' => 'Susanna', 'Appy' => 'Apollonia', 'Archy' => 'Archibald', 'Arnie' => 'Arnold', 'Arny' => 'Arnold', 'Art' => 'Artemis', 'Art' => 'Arthur', 'Arty' => 'Arthur', 'Bab' => 'Barbara', 'Babs' => 'Barbara', 'Barb' => 'Barbara', 'Barney' => 'Barnabas', 'Bart' => 'Bartholomew', 'Barty' => 'Bartholomew', 'Bass' => 'Sebastian', 'Bea' => 'Beatta', 'Bea' => 'Beatrice', 'Beattie' => 'Beatrice', 'Becky' => 'Rebecca', 'Bella' => 'Arabella', 'Bella' => 'Belinda', 'Bella' => 'Elizabeth', 'Bella' => 'Isabel', 'Bella' => 'Isabella', 'Bella' => 'Mirabel', 'Belle' => 'Mabel', 'Belle' => 'Sybil', 'Ben' => 'Benedict', 'Ben' => 'Benjamin', 'Bert' => 'Delbert', 'Bert' => 'Egbert', 'Bertie' => 'Albert', 'Bertie' => 'Gilbert', 'Bess' => 'Elizabeth', 'Bessie' => 'Elizabeth', 'Beth' => 'Elizabeth', 'Beto' => 'Alberto', 'Betsy' => 'Elizabeth', 'Betty' => 'Elizabeth', 'Bev' => 'Beverly', 'Bill' => 'William', 'Bob' => 'Robert', 'Burt' => 'Egbert', 'Cal' => 'Caleb', 'Cal' => 'Calvin', 'Carol' => 'Caroline', 'Cassie' => 'Cassandra', 'Cathy' => 'Catherine', 'Caty' => 'Catherine', 'Cecily' => 'Cecilia', 'Charlie' => 'Charles', 'Chet' => 'Chester', 'Chris' => 'Christian', 'Chris' => 'Christine', 'Chris' => 'Crystal', 'Chuck' => 'Charles', 'Cindy' => 'Cynthia', 'Cindy' => 'Lucinda', 'Cissy' => 'Cecilia', 'Cissy' => 'Clarissa', 'Claus' => 'Nicholas', 'Cleat' => 'Cleatus', 'Clem' => 'Clement', 'Clem' => 'Clementine', 'Cliff' => 'Clifford', 'Cliff' => 'Clifton', 'Clo' => 'Chloe', 'Connie' => 'Constance', 'Connie' => 'Cornelia', 'Conny' => 'Cornelia', 'Cora' => 'Corinne', 'Corky' => 'Courtney', 'Cory' => 'Cornelius', 'Creasey' => 'Lucretia', 'Crissy' => 'Christina', 'Crissy' => 'Christine', 'Cy' => 'Cyrus', 'Cyndi' => 'Cynthia', 'Daisy' => 'Margaret', 'Dan' => 'Daniel', 'Danny' => 'Daniel', 'Dave' => 'David', 'Davy' => 'David', 'Deb' => 'Deborah', 'Debby' => 'Deborah', 'Dee' => 'Deanne', 'Deedee' => 'Diedre', 'Delia' => 'Bridget', 'Delia' => 'Cordelia', 'Delia' => 'Fidelia', 'Della' => 'Delilah', 'Derick' => 'Frederick', 'Di' => 'Diana', 'Di' => 'Diane', 'Dicey' => 'Edith', 'Dicey' => 'Elizabeth', 'Dicey' => 'Eurydice', 'Dick' => 'Richard', 'Didi' => 'Diana', 'Didi' => 'Diane', 'Doc' => 'name given to 7th child', 'Doctor' => 'name given to 7th child', 'Dodie' => 'Delores', 'Dolly' => 'Dorothy', 'Dolly' => 'Margaret', 'Dolly' => 'Martha', 'Dora' => 'Dorothy', 'Dora' => 'Eudora', 'Dora' => 'Isadora', 'Dotty' => 'Dorothy', 'Doug' => 'Douglas', 'Drew' => 'Andrew', 'Eck' => 'Alexander', 'Ed' => 'Edmund', 'Ed' => 'Edward', 'Edie' => 'Edith', 'Effie' => 'Euphemia', 'Elaine' => 'Eleanor', 'Eli' => 'Elijah', 'Eli' => 'Elisha', 'Ella' => 'Eleanor', 'Ella' => 'Gabriella', 'Ella' => 'Luella', 'Ellen' => 'Eleanor', 'Ellie' => 'Danielle', 'Ellie' => 'Eleanor', 'Ellie' => 'Emily', 'Ellie' => 'Gabriella', 'Ellie' => 'Luella', 'Elly' => 'Eleanor', 'Eloise' => 'Heloise', 'Elsie' => 'Elizabeth', 'Emily' => 'Emeline', 'Emma' => 'Emily', 'Eph' => 'Ephraim', 'Erma' => 'Emily', 'Erna' => 'Earnestine', 'Ernie' => 'Earnest', 'Ernie' => 'Earnestine', 'Etta' => 'Loretta', 'Ev' => 'Evangeline', 'Ev' => 'Evelyn', 'Eve' => 'Evelyn', 'Evie' => 'Evelyn', 'Fan' => 'Frances', 'Fanny' => 'Frances', 'Fanny' => 'Veronica', 'Fay' => 'Faith', 'Fina' => 'Josephine', 'Flo' => 'Florence', 'Flora' => 'Florence', 'Flossie' => 'Florence', 'Fran' => 'Frances', 'Frances' => 'Franziska', 'Frank' => 'Francis', 'Frank' => 'Franklin', 'Frankie' => 'Frances', 'Fred' => 'Ferdinand', 'Fred' => 'Frederick', 'Fred' => 'Gottfried', 'Freddie' => 'Frederick', 'Fritz' => 'Frederick', 'Gab' => 'Gabriel', 'Gabby' => 'Gabrielle', 'Gabe' => 'Gabriel', 'Gene' => 'Eugene', 'Genny' => 'Gwenevere', 'Geoff' => 'Geoffrey', 'Gerry' => 'Gerald', 'Gus' => 'Augustus', 'Gus' => 'Gustaf', 'Ham' => 'Hamilton', 'Hank' => 'Henry', 'Hanna' => 'Johanna', 'Hans' => 'Johan', 'Hans' => 'Johannes', 'Harry' => 'Henry', 'Harry' => 'Horace', 'Helen' => 'Eleanor', 'Hester' => 'Esther', 'Ibby' => 'Elizabeth', 'Iggy' => 'Ignatius', 'Issy' => 'Isabella', 'Issy' => 'Isadora', 'Jack' => 'John', 'Jack' => 'Jackson', 'Jackie' => 'Jacqueline', 'Jake' => 'Jacob', 'Jan' => 'Jennifer', 'Jane' => 'Janet', 'Jane' => 'Virginia', 'Jed' => 'Jedediah', 'Jeff' => 'Jeffrey', 'Jennifer' => 'Winifred', 'Jenny' => 'Jennifer', 'Jeremy' => 'Jeremiah', 'Jerry' => 'Jeremiah', 'Jill' => 'Julia', 'Jim' => 'James', 'Jimmy' => 'James', 'Joe' => 'Joseph', 'Joey' => 'Joseph', 'Johnny' => 'John', 'Jon' => 'Jonathan', 'Josh' => 'Joshua', 'Josie' => 'Josephine', 'Joy' => 'Joyce', 'Judy' => 'Judith', 'Kate' => 'Catherine', 'Kathy' => 'Katherine', 'Kathy' => 'Kathlene', 'Katie' => 'Katherine', 'Kissy' => 'Calista', 'Kit' => 'Christopher', 'Kitty' => 'Catherine', 'Klaus' => 'Nicholas', 'Lana' => 'Eleanor', 'Len' => 'Leonard', 'Lena' => 'Magdalena', 'Leno' => 'Felipe', 'Lenora' => 'Eleanor', 'Leo' => 'Leonard', 'Leon' => 'Leonard', 'Lettie' => 'Letitia', 'Lew' => 'Lewis', 'Libby' => 'Elizabeth', 'Lila' => 'Delilah', 'Lisa' => 'Elisa', 'Liz' => 'Elizabeth', 'Liza' => 'Elizabeth', 'Lizzie' => 'Elizabeth', 'Lola' => 'Delores', 'Lorrie' => 'Lorraine', 'Lottie' => 'Charlotte', 'Lou' => 'Louis', 'Louie' => 'Louis', 'Lucy' => 'Lucille', 'Lucy' => 'Lucinda', 'Mabel' => 'Mehitable', 'Maddie' => 'Madeline', 'Maddy' => 'Madeline', 'Madge' => 'Margaret', 'Maggie' => 'Margaret', 'Maggy' => 'Margaret', 'Mame' => 'Margaret', 'Mame' => 'Mary', 'Mamie' => 'Margaret', 'Mamie' => 'Mary', 'Manda' => 'Amanda', 'Mandy' => 'Amanda', 'Mandy' => 'Samantha', 'Manny' => 'Emanuel', 'Manthy' => 'Samantha', 'Marcy' => 'Marcia', 'Marge' => 'Margaret', 'Marge' => 'Marjorie', 'Margie' => 'Margaret', 'Margie' => 'Marjorie', 'Marty' => 'Martha', 'Marv' => 'Marvin', 'Mat' => 'Mathew', 'Matt' => 'Mathew', 'Matt' => 'Matthias', 'Maud' => 'Magdalene', 'Maud' => 'Matilda', 'Maude' => 'Magdalene', 'Maude' => 'Matilda', 'Maury' => 'Maurice', 'Max' => 'Maximilian', 'Max' => 'Maxwell', 'May' => 'Margaret', 'Meg' => 'Margaret', 'Mel' => 'Melvin', 'Mena' => 'Philomena', 'Merv' => 'Mervin', 'Meta' => 'Margareta', 'Mick' => 'Michael', 'Mickey' => 'Michael', 'Midge' => 'Margaret', 'Mike' => 'Michael', 'Millie' => 'Emeline', 'Milly' => 'Millicent', 'Milt' => 'Milton', 'Mimi' => 'Mary', 'Mimi' => 'Wilhelmina', 'Mina' => 'Wilhelmina', 'Mini' => 'Minerva', 'Minnie' => 'Minerva', 'Mira' => 'Elmira', 'Mira' => 'Mirabel', 'Mischa' => 'Michael', 'Mitch' => 'Mitchell', 'Moll' => 'Martha', 'Moll' => 'Mary', 'Molly' => 'Martha', 'Molly' => 'Mary', 'Mona' => 'Ramona', 'Mort' => 'Mortimer', 'Mort' => 'Morton', 'Morty' => 'Mortimer', 'Morty' => 'Morton', 'Mur' => 'Muriel', 'Myra' => 'Almira', 'Nab' => 'Abel', 'Nabby' => 'Abigail', 'Nacho' => 'Ignacio', 'Nadia' => 'Nadine', 'Nan' => 'Ann', 'Nan' => 'Hannah', 'Nan' => 'Nancy', 'Nana' => 'Ann', 'Nana' => 'Hannah', 'Nana' => 'Nancy', 'Nate' => 'Nathan', 'Nate' => 'Nathaniel', 'Ned' => 'Edmund', 'Ned' => 'Edward', 'Ned' => 'Norton', 'Neely' => 'Cornelia', 'Neil' => 'Cornelius', 'Neil' => 'Edward', 'Nell' => 'Cornelia', 'Nell' => 'Eleanor', 'Nell' => 'Ellen', 'Nell' => 'Helen', 'Nellie' => 'Helen', 'Nelly' => 'Cornelia', 'Nelly' => 'Eleanor', 'Nelly' => 'Helen', 'Nessie' => 'Agnes', 'Nettie' => 'Jeanette', 'Netty' => 'Henrietta', 'Nicie' => 'Eunice', 'Nick' => 'Dominic', 'Nick' => 'Nicholas', 'Nicy' => 'Eunice', 'Nikki' => 'Nicole', 'Nina' => 'Ann', 'Nita' => 'Anita', 'Nita' => 'Juanita', 'Noni' => 'Ione', 'Noni' => 'Nora', 'Noni' => 'Eleanor', 'Noni' => 'Elnora', 'Nora' => 'Eleanor', 'Nora' => 'Elnora', 'Nora' => 'Honora', 'Norm' => 'Norman', 'Obed' => 'Obediah', 'Ollie' => 'Oliver', 'Ora' => 'Aurillia', 'Ora' => 'Corinne', 'Pablo' => 'Paul', 'Pacho' => 'Francisco', 'Paco' => 'Francisco', 'Paddy' => 'Patrick', 'Pam' => 'Pamela', 'Pancho' => 'Francisco', 'Pat' => 'Martha', 'Pat' => 'Matilda', 'Pat' => 'Patricia', 'Pat' => 'Patrick', 'Patsy' => 'Martha', 'Patsy' => 'Matilda', 'Patsy' => 'Patricia', 'Patty' => 'Martha', 'Patty' => 'Matilda', 'Patty' => 'Patricia', 'Peg' => 'Margaret', 'Peggy' => 'Margaret', 'Penny' => 'Penelope', 'Pepa' => 'Josefa', 'Pepe' => 'Jose', 'Percy' => 'Percival', 'Pete' => 'Peter', 'Phelia' => 'Orphelia', 'Phil' => 'Philip', 'Phil' => 'Phillip', 'Polly' => 'Mary', 'Polly' => 'Paula', 'Prissy' => 'Priscilla', 'Prudy' => 'Prudence', 'Quil' => 'Aquilla', 'Quillie' => 'Aquilla', 'Rafe' => 'Raphael', 'Randy' => 'Miranda', 'Randy' => 'Randall', 'Randy' => 'Randolph', 'Rasmus' => 'Erasmus', 'Ray' => 'Raymond', 'Reba' => 'Rebecca', 'Reg' => 'Reginald', 'Reggie' => 'Reginald', 'Rena' => 'Irene', 'Rich' => 'Richard', 'Rick' => 'Eric', 'Rick' => 'Frederick', 'Rick' => 'Garrick', 'Rick' => 'Patrick', 'Rick' => 'Richard', 'Rita' => 'Clarita', 'Rita' => 'Margaret', 'Rita' => 'Margarita', 'Rita' => 'Norita', 'Rob' => 'Robert', 'Rod' => 'Roderick', 'Rod' => 'Rodney', 'Rod' => 'Rodrigo', 'Rodie' => 'Rhoda', 'Ron' => 'Aaron', 'Ron' => 'Reginald', 'Ron' => 'Ronald', 'Ronnie' => 'Veronica', 'Ronny' => 'Ronald', 'Rosie' => 'Rosalind', 'Rosie' => 'Rosemary', 'Rosie' => 'Rosetta', 'Roxy' => 'Roxanne', 'Roy' => 'Leroy', 'Rudy' => 'Rudolph', 'Russ' => 'Russell', 'Sadie' => 'Sally', 'Sadie' => 'Sarah', 'Sal' => 'Sarah', 'Sally' => 'Sarah', 'Sam' => 'Samuel', 'Sandy' => 'Alexander', 'Sandy' => 'Sandra', 'Sene' => 'Asenath', 'Senga' => 'Agnes', 'Senie' => 'Asenath', 'Sherm' => 'Sherman', 'Si' => 'Cyrus', 'Si' => 'Matthias', 'Si' => 'Silas', 'Sibella' => 'Isabella', 'Sid' => 'Sidney', 'Silla' => 'Drusilla', 'Silla' => 'Priscilla', 'Silvie' => 'Silvia', 'Sis' => 'Cecilia', 'Sis' => 'Frances', 'Sissy' => 'Cecilia', 'Sol' => 'Solomon', 'Stacia' => 'Eustacia', 'Stacy' => 'Anastasia', 'Stacy' => 'Eustacia', 'Stan' => 'Stanislas', 'Stan' => 'Stanly', 'Stella' => 'Estella', 'Stella' => 'Esther', 'Steve' => 'Steven', 'Steven' => 'Stephen', 'Stew' => 'Stewart', 'Sue' => 'Susan', 'Sue' => 'Suzanne', 'Sukey' => 'Suzanna', 'Susie' => 'Susan', 'Susie' => 'Suzanne', 'Suzy' => 'Susan', 'Suzy' => 'Suzanne', 'Tad' => 'Edward', 'Tad' => 'Thadeus', 'Ted' => 'Edmund', 'Ted' => 'Edward', 'Ted' => 'Theodore', 'Teddy' => 'Edward', 'Teddy' => 'Theodore', 'Telly' => 'Aristotle', 'Terry' => 'Theresa', 'Terza' => 'Theresa', 'Tess' => 'Elizabeth', 'Tess' => 'Theresa', 'Theo' => 'Theobald', 'Theo' => 'Theodore', 'Tia' => 'Antonia', 'Tibbie' => 'Isabella', 'Tilda' => 'Matilda', 'Tilly' => 'Matilda', 'Tilly' => 'Otilia', 'Tim' => 'Timothy', 'Timmy' => 'Timothy', 'Tina' => 'Albertina', 'Tina' => 'Augustina', 'Tina' => 'Christina', 'Tina' => 'Christine', 'Tina' => 'Earnestine', 'Tina' => 'Justina', 'Tina' => 'Martina', 'Tish' => 'Letitia', 'Toby' => 'Tobias', 'Tom' => 'Thomas', 'Tony' => 'Anthony', 'Tracy' => 'Theresa', 'Trina' => 'Katherina', 'Trixie' => 'Beatrice', 'Trudi' => 'Gertrude', 'Trudy' => 'Gertrude', 'Ursie' => 'Ursula', 'Ursy' => 'Ursula', 'Vangie' => 'Evangeline', 'Vern' => 'Vernon', 'Vi' => 'Viola', 'Vi' => 'Violet', 'Vic' => 'Victor', 'Vicky' => 'Victoria', 'Vin' => 'Galvin', 'Vin' => 'Vincent', 'Vina' => 'Alvina', 'Vina' => 'Lavina', 'Vinny' => 'Vincent', 'Virg' => 'Virgil', 'Virgie' => 'Virginia', 'Viv' => 'Vivian', 'Vonnie' => 'Yvonne', 'Wally' => 'Wallace', 'Wally' => 'Walter', 'Walt' => 'Walter', 'Web' => 'Webster', 'Wendy' => 'Gwendolen', 'Wes' => 'Wesley', 'Will' => 'William', 'Willie' => 'Wilhelmina', 'Willy' => 'William', 'Winn' => 'Edwin', 'Winnie' => 'Edwina', 'Winnie' => 'Winifred', 'Woody' => 'Woodrow', 'Xina' => 'Christina', 'Yost' => 'Josef', 'Zac' => 'Isaac', 'Zach' => 'Zachariah', 'Zak' => 'Isaac', 'Zeb' => 'Zebulon', 'Zed' => 'Zedekiah', 'Zeke' => 'Ezekiel', 'Zena' => 'Albertina', 'Zeph' => 'Zephaniah' ); my %ABBREVIATIONS; my %ABBREVIATIONS_LC; setNameAbbreviations(@ABBREVIATIONS); my $AND = '(?:\s+(?:and|&|&|with)\s+)'; my $MERE_COMMA = '(?:\s*,\s*)'; my $MERE_SEMI = '(?:\s*(?:;|
|

|<\/p>)\s*)'; my $SEMI_AND = "(?:$MERE_SEMI|$AND)"; my $COMMA_AND = "(?:$MERE_COMMA$AND|$AND|$MERE_COMMA)"; my $SPACE = '(?:\s|\ |\n|\r)'; my @NAME_PREFIXES_RE; for (my $i=0; $i<=$#NAME_PREFIXES; $i++) { $NAME_PREFIXES_RE[$i] = '(?:$|^|\W)' . $NAME_PREFIXES[$i] . '(?:$|^|\W)'; } my $PREFIXES = "(?:" . join('|',@NAME_PREFIXES_RE) . ")"; # # CODE STARTS HERE # sub setNameAbbreviations { while (my $a = shift @_) { my $b = shift; $ABBREVIATIONS{$a} ||= {}; $ABBREVIATIONS{$a}->{$b} = 1; $ABBREVIATIONS_LC{lc $a} ||= {}; $ABBREVIATIONS_LC{lc $a}->{lc $b} = 1; } } sub getNameAbbreviations { return \%ABBREVIATIONS; } our %commonSurnames; our %commonMaleFirstnames; our %commonFemaleFirstnames; our $namesInitialized = 0; # These three variables initialized below my $COMMON_SURNAMES; my $COMMON_MALE_FIRSTNAMES; my $COMMON_FEMALE_FIRSTNAMES; sub firstnamePrevalence { my $name = uc shift; prepareCommonNames() unless $namesInitialized; return (($commonMaleFirstnames{$name} || 0) + ($commonFemaleFirstnames{$name} || 0))/2; } sub surnamePrevalence { my $name = uc shift; prepareCommonNames() unless $namesInitialized; return $commonSurnames{$name} || 0; } sub isCommonFirstname { my ($name, $percentLimit) = @_; $percentLimit ||= 0; return firstnamePrevalence($name) > $percentLimit; } sub isCommonSurname { my ($name, $percentLimit) = @_; $percentLimit ||= 0; return surnamePrevalence($name) > $percentLimit; } sub isMisparsed { my ($name) = @_; $name = lc $name; return 1 if $name !~ /\w.*,.*\w/; for my $prefix ((@NAME_PREFIXES,'dr','dr.','prof','prof.','jr','jr.')) { #warn "check prefix: $prefix"; return 1 if $name =~ /\b$prefix$/i; } return 1 if $name =~ /^\w\b/; return 0; } sub isLikelyMisparsed { my ($name) = @_; my @parts = parseName2($name); for my $p (@parts) { next unless $p; my @sub_parts = split(/\s+/,$p); return 1 if $#sub_parts >= 2; # three or more subparts likely a mess } # also likely misparsed if firstname containg a likely lastname AND surname contains a likely firstname my @firstname_parts = split(/\s+/, $parts[0]); my @first_surname = grep { isCommonSurname($_) } @firstname_parts; my @surname_parts = split(/\s/, $parts[2]); my @surname_first = grep { isCommonFirstname($_) } @surname_parts; return 1 if $#first_surname > -1 and $#surname_first > -1; return isMisparsed($name); } my $fem_ending = qr/(ette|ne|a)$/i; sub guessGender { my ($name) = @_; $name = uc $name; prepareCommonNames() unless $namesInitialized; # extract firstname part if necessary if ($name =~ /[,\s]/) { my @parts = parseName($name); $name = $parts[0]; } my $fscore = $commonFemaleFirstnames{$name}; my $mscore = $commonMaleFirstnames{$name}; return undef if !($fscore || $mscore); return 'F' if $fscore and !$mscore; return 'M' if $mscore and !$fscore; #warn "M: $mscore vs F: $fscore"; my $threshold = 20; # now the name exist in both. we make a decision if the percentage is very different # now in both return 'F' if $fscore / $mscore >= $threshold; return 'M' if $mscore / $fscore >= $threshold; # adjust threashold based on common female name endings my $fem_end = ($name =~ $fem_ending); if ($fem_end) { return 'F' if $fscore / $mscore >= ($threshold/4); } else { return 'M' if $mscore / $fscore >= ($threshold/4); } return undef; } sub prepareCommonNames { for my $n (split(/\n/,$COMMON_SURNAMES)) { next unless $n =~ /^(.+?)\s+(.+?)\s/; $commonSurnames{uc $1} = $2; } for my $n (split(/\n/,$COMMON_MALE_FIRSTNAMES)) { next unless $n =~ /^(.+?)\s+(.+?)\s/; $commonMaleFirstnames{$1} = $2; } for my $n (split(/\n/,$COMMON_FEMALE_FIRSTNAMES)) { next unless $n =~ /^(.+?)\s+(.+?)\s/; $commonFemaleFirstnames{$1} = $2; } $namesInitialized = 1; } sub reverseName { my $n = shift(); return undef unless defined($n); my @n = split(/,\s*/,$n); return "$n[1] $n[0]"; } sub composeName { my ($given,$last) = @_; my $r = $last; $r .= ", $given" if $given; return $r; } sub normalizeNameWhitespace { my $in = shift; return undef unless defined $in; #print "in: $in\n"; # this used to be optional, but then we never know in advance #my $initialsCanBeLowerCase = shift; #if ($initialsCanBeLowerCase) { $in =~ s/\b([a-z])\b/uc $1/ge; #} $in =~ s/^\s+//g; # remove initial spaces $in =~ s/\s+$//g; # remove term spaces $in =~ s/\s+,/,/g; # remove spaces before coma $in =~ s/,\s*/, /g; # normalize spaces after coma $in =~ s/\.\s*([A-Z])/. $1/g; # adjust spacing between initials #print "in: $in\n"; $in =~ s/([A-Z])\.\s([A-Z])\./$1. $2./g; $in =~ s/\b([A-Z])\b(?![\.'’])/$1./g; while ($in =~ s/([\.\s][A-Z])(\s|$)/$1.$2/g) {}; $in =~ s/\.\s*([A-Z])/. $1/g; # adjust spacing between initials #print "normalized: $in\n"; $in; } sub parseName { my $in = shift; return undef unless defined $in; #print "-->parseName in: $in\n"; $in =~ s/^\s*and\s+//; my $jr = ($in =~ s/,?\sJr\.?(\s|$)//i); $in =~ s/^\s*by\s+//; $in =~ s/\W*et\.? al\.?\W*//; $in =~ s/\.\s*$//; # remove . at the end #print "$in -->"; $in = normalizeNameWhitespace($in); #print "$in'\n"; # check if we have a case of Lastname I. without comma if ($in !~ /,/ and $in=~ /^(.*?\s)((?:[A-Z][\-\.\s]{0,2}){1,3})$/) { #warn "Got a reversed name without comma: $1, $2"; my $init = $2; my $rest = $1; #print "\n\nmatched, rest:$rest--$2\n"; # add . as needed # if ($init !~ /\./) { $init =~ s/([A-Z])([^.]|$)/$1.$2/g; $init =~ s/([A-Z])([^.]|$)/$1.$2/g; # } $rest =~ s/\s$//; $in = normalizeNameWhitespace("$rest, $init"); } elsif ($in =~ /^[^,]+\s\w\.?$/) { #print "case\n"; $in =~ s/^(.+?)\s((?:[A-Z]\.?-?\s?){1,3})$/$1,$2/; } #print "now:$in\n"; # standard cases if ($in =~ /(.*),\s*(.*)/) { return ($2, $1); } else { my @bits = split(' ',$in); #print join(" - ", @bits); if ($#bits == -1) { return ($in,""); } my $lastname = splice(@bits,-1,1); if ($lastname =~ /^Jr\.?$/i and $#bits > -1) { $lastname = $bits[-1] . " $lastname"; splice(@bits,-1,1); } $lastname = "$lastname Jr" if $jr; # add prefixes or Jr to lastname #warn join(" - ",@bits); while (defined $bits[-1] and $bits[-1] =~ /^$PREFIXES$/i) { #warn "GOT PREFIX: $bits[-1]"; $lastname = splice(@bits,-1,1) . " $lastname"; } return (join(' ',@bits),$lastname); #my $firstname = splice(@bits,0,1); #while ($#bits > -1 and $bits[0] =~ /^\s*\w\.?\s*$/) { # $firstname .= " ".splice(@bits,0,1); #} #my $lastname = join(' ', @bits); #return ($firstname, join(' ',@bits)); =crap my @surnames = $#bits > 1 ? @bits[1,$#bits] : ($bits[1]); #warn "doing " . join(" ",@bits); #while ($#bits > 0) { # @surnames = pop @bits; #} #my $surname = pop @bits; return ($bits[0], join(' ',@surnames)); =cut } } sub parseNames { my $in = shift; my $reverse = shift; # means names are stupidly written like this: David, Bourget return undef if !defined $in; while($in =~ s/(^|\W)(dr|prof\.? em\.?|prof|profdr|prof|sir|mrs|ms|mr)\.?(\W)/$1 $3/gi) {} $in =~ s/^\s+//; $in =~ s/([^A-Z]{2,2})\.\s*/$1/; # remove . at the end $in =~ s/\(.+\)\s*$//; # remove trailing parens $in =~ s/(,\s*)?\d\d\d\d-$//; $in =~ s/^\s*[bB]y(\W)/$1/; #remove "By "; $in =~ s/,?\s*et\.? al\.?\s*$//; # et al $in =~ s/^\W+//; #print "== $in\n"; # semi-colon separated if ($in =~ /;/) { return parseNameList(split(/$SEMI_AND/i,$in),$reverse); } # no comma and no semi-colon, so one or two not-reversed names elsif ($in !~ /,/) { return parseNameList(split(/$AND/i,$in),$reverse); } # now that's messy: one or more commas, no semi else { # is there a "and"? #print "$in\n"; if ($in =~ /$AND/i) { #print "AND:$in\n"; # now we check for double duty for commas # We fix what would be missing commas on this hypothesis my $t = $in; $t =~ s/([a-z])\s+([A-Z])(\.|\s|$)/$1, $2$3/g; # we check if it's a silly case of commas playing double duty if ($t =~ /,.+,.+,.+$AND/) { #print "SILLY: $t\n"; my @to; my @tokens = split(/$COMMA_AND/i,$t); for (my $ti=0; $ti <= $#tokens;$ti+=2) { push @to, join(", ",@tokens[$ti..$ti+1]); } return parseNameList(@to,$reverse); } # no silliness. what's after the AND will tell us the format # if there's a comma after, it's probably reversed if ($in =~ /$AND.*,/i) { return parseNameList(split(/$SEMI_AND/i,$in),$reverse); } else { my @parts = split(/$COMMA_AND/i,$in); return parseNameList(@parts,$reverse); } } else { #print "- no and\n"; # no semi, no and, and one or more comma # if 2 or more commas if ($in =~ /,.+,/) { # need to check if this is a silly case of commas with reversed names # check that by looking for two or more ,token, with only one part, and odd number of , my @tokens = split(/$MERE_COMMA/i,$in); my $silly = 0; for my $tok (@tokens) { $silly++ unless $tok =~ m/[\w\.]$SPACE[\w\.]/i; } # if silly combination, every other comma separates two names if ($silly >=2 and $#tokens %2 ==1) { my @to; for (my $ti=0; $ti <= $#tokens;$ti+=2) { push @to, join(", ",@tokens[$ti..$ti+1]); } @tokens = @to; } return parseNameList(@tokens,$reverse); } # else, one comma, no semi, and no and else { # now that's ambiguous between "Doe, John" and "John Doe, John Doe" # but we assume there are no names like "Herrera Abreu, Maria Teresa" # (which there are, this is a real one). that is, if the comma separates # two tokens on each side (not counting de,di,von, etc.), we suppose # these tokens make distinct names my @toks = split(/,/,$in); my @copy = @toks; foreach (@copy) { s/$PREFIXES|(\WJr(\W|$))/ /ig; my @bits = split(' ',$_); if ($#bits <= 0) { # found one side with only one non-trivial token # so there is only one author in $in return parseNameList(($in),$reverse); } } return parseNameList(@toks,$reverse); } } } return (); } sub parseNameList { my @auths; #print "Got: " . join("---", @auths) . "\n"; my $reverse; if (defined($_[-1]) and $_[-1] eq 'reverse') { pop @_; $reverse = 1; } # first we correct for overly split names like 'Bourget; D; John Doe' my @new; #print Dumper(\@_); use Data::Dumper; for (@_) { # if the part looks like an initial, we add it to the previous name part next unless $_; if (/^([A-Z](\.|\s|$)\s?)+$/ and $#new > -1) { if ($new[-1] =~ /,/) { $new[-1] = "$new[-1] $_"; } else { $new[-1] = "$new[-1], $_"; } } else { push @new, $_; } } foreach my $a (@new) { next unless $a; my ($f,$l) = parseName($a); push @auths, ($reverse ? "$f, $l" : "$l, $f"); } return @auths; } sub parseName2 { my $in = shift; my ($i,$s); return ("","") unless defined $in; my ($l,$f) = split(/,\s*/,$in); $f ||= ''; $l ||= ''; # get suffix if ($l =~ s/\s+(Jr\.?|[IV]{2,10})\s*$//) { $s = $1; } #print "f: $f\nl:$l\n"; # separate firstname/initial # if has only initials if ($f =~ /^\s*([A-Z](?:\.|\s|$))(.+)$/) { $f = $1; $i = $2; $i =~ s/^\s*//; } # has a full firstname else { if ($f =~ /^([^\s]+?)\s+((?:[A-Z](?:\.|\s+|$)\s*)+)$/) { $f = $1; $i = $2; } } return ($f,$i,$l,$s); } sub abbreviationOf { my ($a,$b) = @_; return 0 unless $ABBREVIATIONS_LC{lc $a}; return $ABBREVIATIONS_LC{lc $a}->{lc $b} ? 1 : 0; } # if the two names passed as params are such that they could belong to the same person, returns a merged name sub samePerson { my ($a,$b, %opts) = @_; #name1,name2 return undef if !defined($a) or !defined($b); my $a_expd = 0; my $b_expd = 0; my ($lasta,$firsta) = split(',',cleanName($a,' ','reparse')); my ($lastb,$firstb) = split(',',cleanName($b,' ','reparse')); return undef if defined($firsta) and !defined($firstb); return undef if defined($firstb) and !defined($firsta); return undef if defined($lasta) and !defined($lastb); return undef if defined($lastb) and !defined($lasta); #print "here '$lasta'-'$lastb'\n"; $lasta =~ s/\s+Jr\.?$// if defined $lasta; $lastb =~ s/\s+Jr\.?$// if defined $lastb; # check for reversed name if loose if (!equivtext($lasta,$lastb)) { if (!$opts{loose}) { return undef; } else { return samePerson("$firsta, $lasta", "$lastb, $firstb", loose=>0) || samePerson("$firsta $lasta","$firstb $lastb", loose=>0); } } =old # regimentation $firsta =~ s/\./ /g; $firstb =~ s/\./ /g; $firsta =~ s/\s+/ /g; $firstb =~ s/\s+/ /g; =cut my @at = split(" ",$firsta || ''); my @bt = split(" ",$firstb || ''); #print "AT: " . join("-",@at) . "\n"; #print "BT: " . join("-",@bt) . "\n"; # compare each token pair as follows: # if reached the end of tokens on either side, compat # if both are greater than 1 char and diff, not compat # if they don't start by the same letter, not compat # else merge the tokens, compat so far, move on to next token pair # my $merged = "$lasta,"; for (my $i=0; $i <= $#at || $i <= $#bt; $i++) { #print "$i ($merged):" . $at[$i] . "-" . $bt[$i]. "-\n"; # end of tokens reached on one side if ($i > $#at) { #print "END ($merged)\n"; #return undef if $b_expd; $merged .= " ". join(" ",@bt[$i..$#bt]); return cleanName($merged,''); } elsif ($i > $#bt) { #print "END ($merged)\n"; #return undef if $a_expd; $merged .= " ". join(" ",@at[$i..$#at]); return cleanName($merged,''); } # if different tokens if (!equivtext($at[$i],$bt[$i])) { # if different first letters, not compat return undef if !equivtext(substr($at[$i],0,1),substr($bt[$i],0,1)); # otherwise they might be compatible # token a is full word if (length($at[$i]) > 2) { # b is too, they are not compat unless one is a short for the other if (length($bt[$i]) > 2) { if ( abbreviationOf($at[$i],$bt[$i]) ) { $merged .= " " . $bt[$i]; next; } elsif ( abbreviationOf($bt[$i],$at[$i]) ) { $merged .= " " . $at[$i]; next; } else { return undef; } } # b is initial, they are compat so far else { $b_expd = 1; $merged .= " " . $at[$i]; } # a is initial } else { # b is full word $a_expd = 1 if length($bt[$i]) > 2; # keep going $merged .= " " . $bt[$i]; } } # otherwise move on to next token pair straight else { $merged .= " " .$at[$i]; } } # if we get there, the two names are compatible and $merged contains the richest name from the two # print "merged: $merged\n"; return cleanName($merged,''); } sub equivtext { my ($a,$b) = @_; $a = lc rmDiacritics($a); $b = lc rmDiacritics($b); $a =~ s/\.\s*$//; $b =~ s/\.\s*$//; #warn "$a == $b == " . distance($a,$b); # we allow one character difference, except for initial tokens. this is because some diacritics change the underlying letter when removed, e.g. Björklund -> Bjarklund, even though one might naturally write 'Bjorklund' return (length($a) > 1 && length($b) > 1) ? distance($a,$b) <= 1 : $a eq $b; } sub cleanParseName { my $n = shift; # I think that one is overkill.. return parseName(cleanName(composeName(parseName($n)))); } sub cleanName { my ($n, $space, $reparse) = @_; # Some of the cleaning-up here is redundant because also in parseName, which is called last. But it doesn't hurt.. If it works don't try and fix it. return undef unless defined $n; #print "Cleaning name: $n\n"; $n =~ s/\.( \.)+/./g; $n =~ s/\.($APOST)/'/g; # if ", john doe" if ($n =~ s/^\s*,\s+//) { } # if 'john doe,' if ($n =~ s/^([^,]+?)\s*,\s*$/$1/) { } $n =~ s/Get checked abstract//g; $n = rmTags($n); $n =~ s/, By$//i; #if ($reparse and $n =~ s/,/) { # my ($l,$f) = split(/,\s*/,$n); # my ($f,$l) = parseName(join(' ',($f,$l))); # $n = "$l, $f"; #} # Fix for O'Something => O.'Something #$n =~ s/O\.'/O'/; $n =~ s/[\r\n]/ /gsm; $n =~ s/(\w)\s*,/$1,/g; $n =~ s/([a-z]{2,})\./$1/gi; #remove unwanted . $n =~ s/(\W|\.|\s)([A-Z])(\s|$)/$1$2.$3/g; #add . to initials $n =~ s/(\W|\.|\s)([A-Z])(\s|$)/$1$2.$3/g; #add . to initials (repeat for overlaps) $n =~ s/\.\s*([A-Z])/". " . uc $1/ieg; # adjust spacing between initials $n =~ s/\W*\d{4,4}\W*//g; # misplaced dates $n =~ s/\(.*$//; #parentheses # misplaced jr $n =~ s/([\w'-])\s*,(.*)\sJr(\s.*|$)/$1 Jr,$2 $3/i; # misplaced prefixe #warn $n; #$n =~ s/([\w'-])\s*,(.*)\s(van|von|von\sder|van\sder|di|de|del|du|da)(\s.*|$)/(lc $3) . $1 . "," . $2 . $4/ie; #warn $n; # replace Iep by UNKNOWN (for PP) $n =~ s/^Iep,$/Unknown, Unknown/; #links aren't names $n = "Unknown, Unknown" if $n =~ /http:\/\//; # capitalize if nocaps if ($n !~ /[A-Z]/) { $n = capitalize($n,notSentence=>1);#_title($n, PRESERVE_ANYCAPS=>1, NOT_CAPITALIZED=>\@PREFIXES); } # do we have initials stuck on the surname like so: RawlsJ. unless ($n =~ /,/ or $n =~ /\w \w/) { $n =~ s/([A-Z][a-z]{1,})((?:[A-Z](?:$|\.|\s|)\s*)+)\s*$/$1, $2/g; } #warn "$n"; #unless it's all caps, the caps are initials. we unstuck them and add . if ($n =~ /[a-z]/ and $n !~ /[A-Z]{2,} [A-Z]{2,}/) { $n =~ s/(\s|^)([A-Z]{2,3})(\.|\s|$)/$1 . toInitials($2) . $3/ge; } else { } #warn $n; my ($f,$l) = parseName($n); #warn "** $l, $f"; #warn "$l, $f"; $n = composeName($f,$l); # now final capitalization $n = capitalize($n,notSentence=>1); #,PRESERVE_ANYCAPS=>1, NOT_CAPITALIZED=>\@PREFIXES); return $n; } sub toInitials { my $s = shift; #warn "init: $s"; return $s if grep { lc $_ eq lc $s } @NAME_PREFIXES; $s =~ s/^([A-Z])([A-Z])([A-Z])$/$1. $2. $3./; $s =~ s/^([A-Z])([A-Z])$/$1. $2./; return $s; } sub weakenings { my( $firstname, $lastname ) = @_; my @warnings; # default firstname aliases: every middle name can be either in full, initialized, or absent my @first_parts = split(/\s+/,normalizeNameWhitespace($firstname)); my $reduced = 0; if( scalar @first_parts > 3 ){ $reduced = 1; splice( @first_parts, 3 ); push @warnings, "Too many parts in first name: '$firstname'\n"; } my $first = shift @first_parts; for my $i (0..$#first_parts) { my $value = $first_parts[$i]; $first_parts[$i] = [$value]; # the default value is good # try downgrading to initial push @{$first_parts[$i]}, $value if ($value =~ s/(\w)[^\s\.]+/$1./); } my @first_aliases = ( $first ); push @first_aliases, "$1." if $first =~ /(\w)[^\s\.]+/; #print Dumper(\@first_parts); for my $i (0..$#first_parts) { my @new; for my $current (@first_aliases) { for (@{$first_parts[$i]}) { push @new, "$current $_"; } push @new, $current; } @first_aliases = @new; } #print Dumper(\@first_aliases); $lastname = normalizeNameWhitespace($lastname); my @prefixes = map "\\b$_\\b", @NAME_PREFIXES, 'y', 'los'; my $prefixes = join '|', @prefixes; $lastname =~ s/($prefixes) /$1+/ig; my @parts = reverse ( ( split(/\s+/,$lastname) ) ); my @last_aliases; my $lastlast = shift @parts; for my $variation ( variations( $lastlast ) ){ push @last_aliases, $variation; } if( scalar @parts < 3 ){ for my $lpart ( @parts ){ my @curr = @last_aliases; for my $variation ( variations( $lpart ) ){ for my $alias ( @curr ){ next if $variation =~ /-/ && $alias =~ / /; next if $variation =~ / / && $alias =~ /-/; push @last_aliases, "$variation $alias" if $variation !~ /-/ && $alias !~ /-/; push @last_aliases, "$variation-$alias" if $variation !~ / / && $alias !~ / /; } } } } else{ push @warnings, "Too many parts in last name: '$lastname'\n"; push @last_aliases, $lastname; } my @aliases; unshift @first_aliases, $firstname if $reduced; ALIAS: for my $f ( @first_aliases ) { for my $l (@last_aliases) { push @aliases, { firstname => $f, lastname => $l }; if( scalar @aliases > 25 ){ push @warnings, 'More than 25 aliases'; last ALIAS; } } } return \@warnings, @aliases; } sub variations { my $part = shift; my @parts = split /\+/, $part; if( scalar @parts <= 1 ){ return $part; } else{ return join( ' ', @parts ), $parts[-1]; } } sub rmTags { my $in = shift; while ($in =~ s/(<|(?:\<))\/?[^>]*?(>|(?:\>))/ /g) {}; return $in; } sub rmDiacritics { my $str = shift; my $nstr = ''; for ( $str ) { # the variable we work on ## convert to Unicode first ## if your data comes in Latin-1, then uncomment: #$_ = Encode::decode( 'iso-8859-1', $_ ); return "" if !defined $_; $_ = NFD( $_ ); ## decompose s/\pM//g; ## strip combining characters s/[^\0-\x80]//g; ## clear everything else $nstr .= $_; } $nstr; } sub rmDiacriticsNOTGOOD { my $str = shift; my $nstr = ''; # # This code (c) Ivan Kurmanov, http://ahinea.com/en/tech/accented-translate.html # for ( $str ) { # the variable we work on ## convert to Unicode first ## if your data comes in Latin-1, then uncomment: #$_ = Encode::decode( 'iso-8859-1', $_ ); s/\xe4/ae/g; ## treat characters ä ñ ö ü ÿ s/\xf1/ny/g; ## this was wrong in previous version of this doc s/\xf6/oe/g; s/\xfc/ue/g; s/\xff/yu/g; $_ = NFD( $_ ); ## decompose (Unicode Normalization Form D) s/\pM//g; ## strip combining characters # additional normalizations: s/\x{00df}/ss/g; ## German beta â<80><9c>Ã<9f>â<80><9d> -> â<80><9c>ssâ<80><9d> s/\x{00c6}/AE/g; ## Ã<86> s/\x{00e6}/ae/g; ## æ s/\x{0132}/IJ/g; ## IJ s/\x{0133}/ij/g; ## ij s/\x{0152}/Oe/g; ## Å<92> s/\x{0153}/oe/g; ## Å<93> tr/\x{00d0}\x{0110}\x{00f0}\x{0111}\x{0126}\x{0127}/DDddHh/; # Ã<90>Ä<90>ðÄ<91>Ħħ tr/\x{0131}\x{0138}\x{013f}\x{0141}\x{0140}\x{0142}/ikLLll/; # ıĸĿÅ<81>Å<80>Å<82> tr/\x{014a}\x{0149}\x{014b}\x{00d8}\x{00f8}\x{017f}/NnnOos/; # Å<8a>Å<89>Å<8b>Ã<98>øſ tr/\x{00de}\x{0166}\x{00fe}\x{0167}/TTtt/; # Ã<9e>Ŧþŧ s/[^\0-\x80]/ /g; ## space for everything else; optional $nstr .= $_; } $nstr; } sub capitalize { my $txt = shift; my %args = @_; #print "bef: $txt\n"; # we don't want to recapitalize when it look ok # what doesn't look ok is a token with all lowercase (4 or more chars) or allcaps (2 or more chars) return $txt unless $txt =~ /\b[A-Z]{2,}\b/ or $txt =~ /\b[a-z]{4,}\b/; my $t = capitalize_title($txt); if ($args{notSentence}) { $t =~ s/^($PREFIXES)/lc $1/ie; } #fix for bug in text::capitalize $t =~ s/&Quot;?(\.?)$/"$1/g; return $t; } # # Top 1000 most common male first names in the US according to the 1990 US survey, with relative frequency (%, first column) # $COMMON_MALE_FIRSTNAMES = < 'David', 'Davy' => 'David', etc. =head2 getNameAbbreviations Returns the abbreviation mapping. =head2 weakenings(string first_name, string last_name): array Returns an array of normalized names which are weakenings of the first and last name passed as argument. Substituting a given names by an initial, or removing an initial, for example, are operations which generate weakenings of a name. Such operations are applied with arbitrary depth, until the name has been reduced to a single initial followed by the lastname, and all intermediary steps returned. You can use weakenings(parseName("Lastname, Firstname")) to weaken a first and last name as a single string. =head2 guessGender(string firstname, [float threshold]): string Returns 'F' if someone with the provided firstname is likely female, 'M' if likely male, and undef otherwise. A frequency threshold (default = 0) can be specified so that a gender is returned only if the name is found with at least this frequency among people with this gender (according to the US census). A threshold of 0.1 (which means 0.1%) ensures very reliable results (precision above 99%) with a recall of about 60%. When the threshold is lower, this function has a tendency to overestimate the number of females. =head1 EXPORT None by default. =head1 KNOWN ISSUES This module currently overwrites @Text::Capitalize::exceptions globally, which can have unintended side-effects. =head1 SEE ALSO The xPapers application framework from which this has been extracted, http://www.xpapers.org The related L module. =head1 AUTHOR David Bourget, http://www.dbourget.com, with contributions by Zbigniew Lukasiak =head1 COPYRIGHT AND LICENSE Copyright (C) 2011-2013 by David Bourget and University of London This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.10.1 or, at your option, any later version of Perl 5 you may have available. =cut 1;