CL Wiki

Institute of Computational Linguistics – University of Zurich

User Tools

Site Tools


public:paste:spanish.sh
#!/bin/sh
#
# spanish.sh - run a gzipped, tab-separated corpus through the Spanish
# TreeTagger wrapper and write one line of <s>/<e> markup per corpus line.
#
# Usage: spanish.sh [CORPUS.gz [OUTPUT.gz]]
#   CORPUS.gz  gzipped input; only the third tab-separated field of each line
#              is used (default: ep_extract_5_1000000_unique2_10k.txt.gz)
#   OUTPUT.gz  gzipped result (default: sentences/spanish.sent.gz)
#
# Requires: gzip, awk, tr, GNU sed (-r, and \t / \n escapes), and
#           tree-tagger-spanish-utf8 on PATH.

set -eu
# pipefail is not available in every /bin/sh; turn it on only where it exists
# so that a failing middle stage is not masked by a successful gzip.
if (set -o pipefail) 2>/dev/null; then
  set -o pipefail
fi

pcorpus=${1:-ep_extract_5_1000000_unique2_10k.txt.gz}
outfile=${2:-sentences/spanish.sent.gz}
tagger=tree-tagger-spanish-utf8

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Fail early: otherwise the pipeline would still end with gzip writing a
# valid-looking but empty archive.
[ -r "$pcorpus" ] || die "spanish.sh: cannot read corpus: $pcorpus"
command -v "$tagger" >/dev/null 2>&1 || die "spanish.sh: $tagger not found on PATH"
mkdir -p -- "$(dirname -- "$outfile")"

# Stage 1: take field 3, wrap every line as <t><s>...</s></t>, turn each '|'
#          into a sentence boundary, and replace the typographic apostrophe
#          with a plain one before tagging.
# Stage 2: tag. Lines coming back that do not start with '<' and carry three
#          tab-separated fields become <e type=F1 lemma=F3 tag=F2>; a
#          "<unknown>" marker is dropped first, leaving that field empty.
#          (Presumably F1/F2/F3 are token/POS/lemma - confirm against the
#          wrapper's options.)
# Stage 3: join everything onto one line, then start a new <s> after an
#          FS / COLON / SEMICOLON element whenever another opening tag
#          follows, put each <t> unit back on its own line, and restore the
#          typographic apostrophe.
gzip -dc -- "$pcorpus" \
  | awk -F '\t' '{ print $3 }' \
  | sed -r \
      -e 's/^(.*)$/<t><s>\1<\/s><\/t>/' \
      -e 's/\|/<\/s><s>/g' \
      -e "s/’/'/g" \
  | "$tagger" \
  | sed -r \
      -e 's/<unknown>//' \
      -e 's/^([^<].*)\t(.*)\t(.*)$/<e type="\1" lemma="\3" tag="\2">/' \
  | tr -d '\n' \
  | sed -r \
      -e 's/(<e type="[.?!]" lemma="[.?!]" tag="FS">)(<[^\/])/\1<\/s><s>\2/g' \
      -e 's/(<e type=":" lemma=":" tag="COLON">)(<[^\/])/\1<\/s><s>\2/g' \
      -e 's/(<e type=";" lemma=";" tag="SEMICOLON">)(<[^\/])/\1<\/s><s>\2/g' \
      -e 's/<\/t>/\n/g' \
      -e 's/<t>//g' \
      -e "s/'/’/g" \
  | gzip > "$outfile"
# Note on the FS rule above: '!' carries no backslash because inside a
# bracket expression a backslash is matched literally, which would also
# accept "\" as sentence-final punctuation.
public/paste/spanish.sh.txt · Last modified: 2023-09-15 20:33 by 127.0.0.1

Donate Powered by PHP Valid HTML5 Valid CSS Driven by DokuWiki