|
|
|
use warnings; |
|
use strict; |
|
use utf8; |
|
use open qw(:std :utf8); |
|
|
|
use File::Copy; |
|
|
|
sub write_sentences { |
|
my ($target_file, @sentences) = @_; |
|
|
|
open (my $f, ">", $target_file) or die "Cannot open file $target_file: $!"; |
|
foreach my $sentence (@sentences) { |
|
print $f "$sentence\n"; |
|
} |
|
close $f; |
|
} |
|
|
|
|
|
@ARGV >= 2 or die "Usage: $0 target_directory lang <train_file\n"; |
|
my $target = shift @ARGV; |
|
my $lang = shift @ARGV; |
|
|
|
|
|
my ($sentence, @sentences) = (""); |
|
while (<STDIN>) { |
|
chomp; |
|
if (/^$/) { |
|
push @sentences, $sentence; |
|
$sentence = ""; |
|
} else { |
|
$sentence .= $_ . "\n"; |
|
} |
|
} |
|
die "Unfinished sentence" if $sentence; |
|
|
|
|
|
my $train_end = int(@sentences * 90 / 100); |
|
die "Zero sentences for dev_file" if $train_end == int(@sentences); |
|
|
|
write_sentences("$target/$lang-ud-train.conllu", @sentences[0..$train_end-1]); |
|
write_sentences("$target/$lang-ud-dev.conllu", @sentences[$train_end..$#sentences]); |
|
|