#!/bin/sh
#
# Tag a gzip-compressed, tab-separated corpus with TreeTagger (Spanish) and
# write one tagged text per line, as a flat sequence of <s>, </s> and
# <e type="..." lemma="..." tag="..."> markers, to a gzip file.
#
# Usage: this_script [CORPUS_GZ [OUTPUT_GZ]]
#   CORPUS_GZ  input corpus (default: ep_extract_5_1000000_unique2_10k.txt.gz);
#              only the third tab-separated column of each line is used.
#   OUTPUT_GZ  output file (default: sentences/spanish.sent.gz); its parent
#              directory is created if it does not exist.
#
# Requires: gzip, awk, tr, GNU sed (for -r and the \t / \n escapes) and
# tree-tagger-spanish-utf8 on PATH.

set -eu

# Make a failing middle stage fail the whole pipeline where the shell
# supports it; plain POSIX sh implementations without pipefail skip this.
if (set -o pipefail) 2>/dev/null; then
  set -o pipefail
fi

pcorpus=${1:-"ep_extract_5_1000000_unique2_10k.txt.gz"}
out_file=${2:-"sentences/spanish.sent.gz"}
tagger="tree-tagger-spanish-utf8"

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Fail early with a clear message instead of producing an empty archive.
[ -r "$pcorpus" ] || die "cannot read corpus: $pcorpus"
command -v "$tagger" >/dev/null 2>&1 || die "not found on PATH: $tagger"
mkdir -p -- "$(dirname -- "$out_file")" || die "cannot create output directory for: $out_file"

# Stage 1: decompress and keep only the third tab-separated column.
gzip -dc -- "$pcorpus" |
  awk 'BEGIN{FS="\t"}{print $3}' |
  # Stage 2: wrap every line in <t><s>...</s></t>, turn each "|" into a
  # sentence boundary (</s><s>), and replace the typographic apostrophe with
  # a plain one before tagging.
  sed -r \
    -e 's/^(.*)$/<t><s>\1<\/s><\/t>/' \
    -e 's/\|/<\/s><s>/g' \
    -e "s/’/'/g" |
  # Stage 3: tag. NOTE(review): the next stage assumes the tagger emits
  # three tab-separated columns per token (mapped to type, tag, lemma in that
  # order) and passes the <...> markup lines through -- confirm against the
  # installed TreeTagger wrapper.
  "$tagger" |
  # Stage 4: drop the first "<unknown>" on each line (leaving that field
  # empty) and rewrite every three-column line that does not start with "<"
  # as an <e .../> style element.
  sed -r \
    -e 's/<unknown>//' \
    -e 's/^([^<].*)\t(.*)\t(.*)$/<e type="\1" lemma="\3" tag="\2">/' |
  # Stage 5: join everything into one line so the following substitutions
  # can match across what used to be line boundaries.
  tr -d '\n' |
  # Stage 6: start a new sentence after FS (. ? !), COLON and SEMICOLON
  # elements whenever they are followed by an opening tag (i.e. not already
  # followed by a closing tag such as </s>); then put each text back on its
  # own line by turning </t> into a newline, drop <t>, and restore the
  # typographic apostrophe.
  # The bracket expression is [.?!]: the original "[.?\!]" kept the backslash
  # inside double quotes and therefore also matched a literal "\".
  sed -r \
    -e 's/(<e type="[.?!]" lemma="[.?!]" tag="FS">)(<[^\/])/\1<\/s><s>\2/g' \
    -e 's/(<e type=":" lemma=":" tag="COLON">)(<[^\/])/\1<\/s><s>\2/g' \
    -e 's/(<e type=";" lemma=";" tag="SEMICOLON">)(<[^\/])/\1<\/s><s>\2/g' \
    -e 's/<\/t>/\n/g' \
    -e 's/<t>//g' \
    -e "s/'/’/g" |
  # Stage 7: compress the result.
  gzip >"$out_file"