免费注册 查看新帖 |

Chinaunix

  平台 论坛 博客 文库
最近访问板块 发新帖
查看: 1550 | 回复: 0
打印 上一主题 下一主题

好心人帮忙写一下注释 [复制链接]

论坛徽章:
0
跳转到指定楼层
1 [收藏(0)] [报告]
发表于 2006-05-29 18:27 |只看该作者 |倒序浏览
package Lingua::EN::NamedEntity;

use Lingua::Stem::En;
Lingua::Stem::En::stem_caching({ -level => 2});

use 5.006;
use strict;
use warnings;
use Carp;
use Fcntl;
use DB_File;

my $wordlist = _find_file("wordlist";
our %dictionary;
tie %dictionary, "DB_File", $wordlist, O_RDONLY
  or carp "Couldn't open wordlist: $!\n";

my $forename = _find_file("forename";
our %forenames;
tie %forenames, "DB_File", $forename, O_RDONLY
  or carp "Couldn't open forename list: $!\n";


require Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(extract_entities);

our $VERSION = '1.6';


# Regexps for constructing capitalised sequences
my $conjunctives = qr/of|and|the|&|\+/i;
my $break = qr/\b|^|$/;
my $people_initial = qr/Mrs?|Ms|Dr|Sir|Lady|Lord/;
my $people_terminal = qr/Sr|Jr|Esq/;
my $places_initial = qr/Mt|Ft|St|Lake|Mount/;
my $places_terminal = qr/St(reet)?|Ave(nue)?/i;
my $abbr = qr/$people_initial|$people_terminal|$places_initial|St/;
my $capped = qr/$break (?abbr(\.|$break)|[A-Z][a-z]* $break)/x;
my $folky = qr/-(??:in|under|over|by|the)-)+/i;
my $middle = qr/ $folky |
                                [\s-] (?conjunctives [\s-])* /x;
my $phrase = qr/$capped (?middle $capped)*/x;

my $word = qr/\s*\b\w+\b\s*/;
my $context = qr/$word{1,2}/;

sub extract_entities {
    my $text = shift;
    $text =~ s/\n/ /g;
    $text =~ s/ +/ /g;
    my @candidates;
    @candidates = _combine_contexts(
    map { _categorize_entity($_) }
     _spurn_dictionary_words(_extract_capitalized($text)));
}


sub _categorize_entity {
    my $e = shift;
    $e->{scores} = { person => 1, place => 1, organisation => 1};
    bless $e, "Lingua::EN::NamedEntity";
    $e->_definites and return $e;
    $e->_name_clues;
    $e->_place_clues;
    $e->_org_clues;
    $e->_fix_scores;
    return $e;
}

sub _definites {
    my $e = shift;
    my $ent = $e->{entity};
    if ($ent =~ /^$people_initial\.?\b/ or $ent =~ /\b$people_terminal\.?$/) {
        $e->{scores}{person} = 100;
        return 1;
    }
    if ($ent =~ /^$places_initial\.?\b/ or $ent =~ /\b$places_terminal\.?$/) {
        $e->{scores}{place} = 100;
        return 1;
    }
    return 0;
}

my $pre_name =
qr/chair|\w+man|\w+person|director|executive|manager|president|secretary|chancellor|
minister|governor|chief|deputy|head|member|officer/ix;
sub _name_clues {
    my $e = shift;
    my $ent = $e->{entity};
    my @x;
    $e->{scores}{person} += 10 if $e->{pre} =~ /(\b|^)$pre_name(\b|$)/;
    $e->{scores}{person} += 3 if (@x = split /\W+/, $ent) == 2;
    my @words = grep { exists $forenames{lc $_} } split /\W+/, $ent;
    $e->{scores}{person} += 5 * @words;
}

my $pre_place = qr/in|at/i;
sub _place_clues {
    my $e = shift;
    my @x;
    $e->{scores}{place} += 3  if (@x = split /\W+/, $e->{entity}) == 1;
    $e->{scores}{place} += 3 if $e->{pre} =~ /(^|\b)$pre_place(\b|$)/;
}

sub _org_clues {
    my $e = shift;
    my $ent = $e->{entity};
    $e->{scores}{organisation} += 10 if $ent =~ /\b(&|and|\+)\b/;
    my @words = grep { _stemmed_word_in_dictionary($_) } split /\W+/, $ent;
    $e->{scores}{organisation} += @words;
}

sub _fix_scores {
    my $e = shift;
    if (!$e->{class}) {
        $e->{class} = (sort {$e->{scores}{$b}<=>$e->{scores}{$a}} keys
        %{$e->{scores}} )[0];
    }
    return $e;
}

sub _spurn_dictionary_words {
    my @initial = @_;
    my @candidates;
    # Spurn sentence-initial dictionary words
    for my $e (@initial) {
        do { push @candidates, $e; next} if $e->{pre} and $e->{entity} =~ / /;
        my $word = lc $e->{entity};
        next if exists $dictionary{$word} ||
                _stemmed_word_in_dictionary($word);
        push @candidates, $e;
    }
    return @candidates;
}

sub _stemmed_word_in_dictionary {
    my $word = lc shift;
    my ($stemmed) = @{ Lingua::Stem::En::stem({ -words => [ $word ] }) };
    return exists $dictionary{$stemmed};
}

sub _extract_capitalized {
    my $text = shift;
    my @results;
    while ($text =~ /($phrase)/ms) {
        my $entity = $1;
        $text =~ s/($context?)\Q$entity\E($context?)/$2/;
        my ($pre, $post)= ($1, $2);
        while ($entity =~ s/^($conjunctives\s+)// or
               $entity =~ s/^(.+?)(Mrs?|Ms|Dr|Mt|Ft)/$2/) {
            $pre .= $1;
        }
        next if length $entity <2;
        push @results, { entity => $entity, pre => $pre, post => $post };
    }
    return @results;
}

sub _combine_contexts {
    my @entities = @_;
    my %combined;
    my @rv;
    # If something's a person in one sentence, it's likely to be one in
    # another too!
    for my $e (@entities) {
        $combined{$e->{entity}}{entity} = $e->{entity};
        for my $class (keys %{$e->{scores}}) {
            $combined{$e->{entity}}{scores}{$class} += $e->{scores}{$class}
        }
    }
    for my $e (values %combined) {
        push @rv, _fix_scores($e);
     }
    return @rv;
}


sub _find_file {
    my $file = shift;
    my @files = grep { -e $_ } map { "$_/Lingua/EN/NamedEntity/$file" } @INC;
    return $files[0];
}

1;
__END__
您需要登录后才可以回帖 登录 | 注册

本版积分规则 发表回复

  

北京盛拓优讯信息技术有限公司. 版权所有 京ICP备16024965号-6 北京市公安局海淀分局网监中心备案编号:11010802020122 niuxiaotong@pcpop.com 17352615567
未成年举报专区
中国互联网协会会员  联系我们:huangweiwei@itpub.net
感谢所有关心和支持过ChinaUnix的朋友们 转载本站内容请注明原作者名及出处

清除 Cookies - ChinaUnix - Archiver - WAP - TOP