清华大佬耗费三个月吐血整理的几百G的资源,免费分享!....>>>
#!/usr/bin/perl
use strict;
#use warnings;
use encoding 'utf8';
use Data::Dumper;
sub dump_chars();
sub dump_sentenses();
sub dump_index();
sub recount_chars;
sub mark_chars;
sub dump_result;
sub count_unsed_chars;
sub count_weight;
# 读取字频表
my %chars;
open(CHAR_FILE, '<', 'lex-chars.lex');
while (my $line = <CHAR_FILE>) {
chomp($line);
my @items = split(/\//, $line);
print "bad line: $line\n" if (scalar(@items) != 3);
$chars{$items[0]} = $items[2];
}
close(CHAR_FILE);
# 排序,按字频输出
my @sorted_chars = sort { $chars{$b} <=> $chars{$a} } keys %chars;
#dump_chars();
# 筛选出频率前1000个字
my %top_chars;
for (my $i = 0; $i < 2000; $i++) {
$top_chars{$sorted_chars[$i]} = 1;
}
# 读取名句语料库
my %sentenses;
my %char_sentenses;
my %first_chars;
my %last_chars;
while (my $file = <r01*>) {
# print "reading $file\n";
open(FILE, '<', $file);
while (my $line = <FILE>) {
chomp($line);
$line =~ s/^\s+//g;
$line =~ s/\s+$//g;
next if (!$line);
$line =~ s/。$//;
$line =~ s/!$//;
$line =~ s/?$//;
$line =~ s/,$//;
next if ($line =~ /^—/);
next if (length($line) < 10);
next if (length($line) > 16);
next if ($line =~ /^(/);
next if ($line =~ /《/);
next if ($line =~ /[0-9A-Za-z]/);
next if ($line =~ /^,/);
next if ($line =~ /-/);
$sentenses{$line}++;
my $first_char = substr($line, 0, 1);
my $last_char = substr($line, length($line) - 1, 1);
$first_chars{$first_char} = 1;
$last_chars{$last_char} = 1;
}
close(FILE);
}
# 排序,按句频输出
my @sorted_sentenses = sort { $sentenses{$b} <=> $sentenses{$a} } keys %sentenses;
#dump_sentenses();
# 过滤只出现过一次的句子
#while ($sentenses{$sorted_sentenses[$#sorted_sentenses]} < 2) {
# pop(@sorted_sentenses);
#}
print 'total sentenses: ' . scalar(@sorted_sentenses) . "\n";
# 过滤末字无法接的句子
foreach my $sentense (keys(%sentenses)) {
my $last_char = substr($sentense, length($sentense) - 1, 1);
if (!$first_chars{$last_char}) {
#print "delete $sentense\n";
delete($sentenses{$sentense});
}
}
print 'total sentenses: ' . scalar(%sentenses) . "\n";
# 过滤首字无法接的句子
foreach my $sentense (keys(%sentenses)) {
my $first_char = substr($sentense, 0, 1);
if (!$last_chars{$first_char}) {
#print "delete $sentense\n";
delete($sentenses{$sentense});
}
}
print 'total sentenses: ' . scalar(%sentenses) . "\n";
foreach my $sentense (keys(%sentenses)) {
# 建立首字索引
my $first_char = substr($sentense, 0, 1);
if (!$char_sentenses{$first_char}) {
$char_sentenses{$first_char} = [];
}
push($char_sentenses{$first_char}, $sentense);
}
# 输出索引信息
#dump_index();
# 统计top字出现在各句子的次数
my %char_in_sentense;
foreach my $char (keys(%top_chars)) {
my $count = 0;
foreach my $sentense (@sorted_sentenses) {
$count++ if (index($sentense, $char) >= 0);
}
$char_in_sentense{$char} = $count;
}
my @sorted_char_in_sentenses = sort { $char_in_sentense{$a} <=> $char_in_sentense{$b} } keys %char_in_sentense;
#my $i = 0;
#foreach my $c (@sorted_char_in_sentenses) {
# $i++;
# print "$i:\t$c\t " . $char_in_sentense{$c} . "\n";
#}
# 以任意一个句子开头,接龙,首字不重复,直至不能接或首尾相连
my %used_first_chars;
my %used_sentenses;
my %used_chars;
my $char_count = 0;
my @sentense_queue;
my $max_char_count = 0;
my $max_depth = 0;
my $max_loop_depth = 0;
my $lastlog = time();
$| = 1; # disable IO buffer
sub traverse {
my ($sentense, $depth, $progress) = @_;
# 每分钟显示最新进度
if (time() - $lastlog > 60) {
print "sentense: $sentense, depth: $depth, count: $char_count, total progress: $progress\n";
$lastlog = time();
}
return if ($used_sentenses{$sentense});
$depth++;
$used_sentenses{$sentense} = 1; # 标记句子已使用
mark_chars($sentense);
push(@sentense_queue, $sentense);
if ($depth != scalar(@sentense_queue)) {
print "$sentense: $depth\n";
dump_result();
exit;
}
if ($depth > $max_depth) {
$max_depth = $depth;
print "new depth: $depth\n";
dump_result();
}
# 找到覆盖1000个字的方案(实际上100句职能覆盖500多个字)
if ($char_count >= 1000) {
if ($char_count >= 1000) {
print "found one: $depth\n";
dump_result();
exit;
}
}
if ($depth >= 100) {
if ($char_count > $max_char_count) {
$max_char_count = $char_count;
print "new record: $max_char_count\n";
dump_result();
}
#print "too deep: $char_count\n";
pop(@sentense_queue);
return;
}
my $last_char = substr($sentense, length($sentense) - 1, 1);
#print $last_char;
# 找到闭环
if ($last_char eq substr($sentense_queue[0], 0, 1)) {
if ($depth > $max_loop_depth) {
$max_loop_depth = $depth;
print "new loop: $depth\n";
dump_result();
}
pop(@sentense_queue);
return;
}
if ($char_sentenses{$last_char}) {
my %child_sentenses;
foreach my $s (@{$char_sentenses{$last_char}}) {
if (!$used_sentenses{$s}) {
my $count = count_weight($s);
if ($count > 0) {
$child_sentenses{$s} = $count;
}
}
}
my @sorted_children = sort { $child_sentenses{$b} <=> $child_sentenses{$a} } keys %child_sentenses;
my $i = 0;
my $len = scalar(@sorted_children);
foreach my $s (@sorted_children) {
$i++;
my $first_char = substr($s, 0, 1);
if (!$used_first_chars{$first_char}) {
$used_first_chars{$first_char} = 1;
traverse($s, $depth, $progress * ($len - $i + 1));
delete($used_sentenses{$s});
delete($used_first_chars{$first_char});
recount_chars();
}
}
pop(@sentense_queue);
} else {
pop(@sentense_queue);
#print "dead at $sentense: $depth; last char:$last_char\n";
}
}
# 挑选第一个句子
my %child_sentenses;
foreach my $s (keys(%sentenses)) {
my $count = count_weight($s);
if ($count > 0) {
$child_sentenses{$s} = $count;
}
}
my @sorted_children = sort { $child_sentenses{$b} <=> $child_sentenses{$a} } keys %child_sentenses;
my $i = 0;
my $len = scalar(@sorted_children);
foreach my $s (@sorted_children) {
$i++;
my $first_char = substr($s, 0, 1);
$used_first_chars{$first_char} = 1;
traverse($s, 0, $len - $i + 1);
delete($used_sentenses{$s});
delete($used_first_chars{$first_char});
recount_chars();
print "depth: 0, count: $char_count, total progress: " . ($len - $i + 1) .
", progress: $i/$len, weight: " . $child_sentenses{$s} . "\n";
}
sub dump_chars() {
my $i = 0;
foreach my $char (@sorted_chars) {
$i++;
print "$i\t $char\t $chars{$char}\n";
}
}
sub dump_sentenses() {
my $i = 0;
foreach my $sentense (@sorted_sentenses) {
$i++;
print "$i\t $sentense\t $sentenses{$sentense}\n";
}
}
sub dump_index() {
foreach my $char (keys(%char_sentenses)) {
print "$char: " . scalar(@{$char_sentenses{$char}}) . "\n";
}
}
sub mark_chars {
my $sentense = shift;
for (my $i = 0, my $len = length($sentense); $i < $len; $i++) {
my $char = substr($sentense, $i, 1);
if ($top_chars{$char} && !$used_chars{$char}) {
$char_count++;
}
$used_chars{$char} = 1;
}
}
sub count_weight {
my $sentense = shift;
my $weight = 0;
for (my $i = 0, my $len = length($sentense); $i < $len; $i++) {
my $char = substr($sentense, $i, 1);
if ($top_chars{$char} && !$used_chars{$char}) {
if ($char_in_sentense{$char}) {
$weight += 1 / $char_in_sentense{$char};
}
}
}
return $weight;
}
sub count_unsed_chars {
my $sentense = shift;
my $count = 0;
for (my $i = 0, my $len = length($sentense); $i < $len; $i++) {
my $char = substr($sentense, $i, 1);
if ($top_chars{$char} && !$used_chars{$char}) {
$count++;
}
}
return $count;
}
sub recount_chars {
# 清空计算器
undef %used_chars;
$char_count = 0;
foreach my $sentense (keys(%used_sentenses)) {
mark_chars($sentense);
}
}
sub dump_result {
print "chars: " . keys(%used_chars) . "\n";
print "char_count: $char_count\n";
my $i = 0;
foreach my $sentense (@sentense_queue) {
$i++;
print "$i: $sentense\n";
}
}