User Tools


Differences

This shows you the differences between two versions of the page.

Link to this comparison view

public:paste:spanish.sh [2014-07-10 20:15] – created Johannes Graën | public:paste:spanish.sh [2023-09-15 20:33] (current) – external edit 127.0.0.1
Line 1: Line 1:
 +<file bash>
 +#!/bin/sh
  
 +pcorpus="ep_extract_5_1000000_unique2_10k.txt.gz"
 +
 +zcat $pcorpus \
 +| awk 'BEGIN{FS="\t"}{print $3}' \
 +| sed -r -e "s/^(.*)$/<t><s>\1<\/s><\/t>/" \
 +        -e "s/\|/<\/s><s>/g" \
 +        -e "s/’/'/g" \
 +| tree-tagger-spanish-utf8 \
 +| sed -r -e "s/<unknown>//" -e "s/^([^<].*)\t(.*)\t(.*)$/<e type=\"\1\" lemma=\"\3\" tag=\"\2\">/" \
 +| tr -d "\n" \
 +| sed -r -e "s/(<e type=\"[.?\!]\" lemma=\"[.?\!]\" tag=\"FS\">)(<[^\/])/\1<\/s><s>\2/g" \
 +        -e "s/(<e type=\":\" lemma=\":\" tag=\"COLON\">)(<[^\/])/\1<\/s><s>\2/g" \
 +        -e "s/(<e type=\";\" lemma=\";\" tag=\"SEMICOLON\">)(<[^\/])/\1<\/s><s>\2/g" \
 +        -e "s/<\/t>/\n/g" \
 +        -e "s/<t>//g" \
 +        -e "s/'/’/g" \
 +| gzip > sentences/spanish.sent.gz
 +</file>

CL Wiki

Institute of Computational Linguistics – University of Zurich