#!/bin/sh
# Extract short, aligned English/French speech pairs from session XML files.
#
# For every sessions/*.xml file, each /session/chapter/turn/speaker element
# that has at least one <p type="speech"> under both its English and its
# French <text> child is turned into one output line:
#
#   <english text><TAB><french text>
#
# Only lines in which both fields are shorter than 20 characters are printed.
#
# Requires xmlstarlet and GNU sed (-r, \w, and \n / \t in replacements).

# Opening tag of a speech paragraph as it is expected to appear in the
# xmlstarlet --copy-of output.
# NOTE(review): this token was missing from the sed expressions in the
# previous revision of this script (they contained a bare line break in its
# place, which sed rejects). It is reconstructed from the XPath predicate
# p[@type='speech'] -- confirm against real xmlstarlet output.
readonly P_OPEN='<p type="speech">'

# Dump the matching speakers of one session file as a marker-delimited stream:
#   @@@ <english p elements> @@@ <french p elements> @#@
# Arguments: $1 - path to a session XML file (fed to xmlstarlet on stdin)
# Outputs:   raw xmlstarlet output on stdout
extract_pairs() {
  xmlstarlet sel --encode utf-8 --template \
    --match "/session/chapter/turn/speaker[text[@language='en']/p[@type='speech'] and text[@language='fr']/p[@type='speech']]" \
    --output "@@@" \
    --copy-of "text[@language='en']/p" \
    --output "@@@" \
    --copy-of "text[@language='fr']/p" \
    --output "@#@" \
    < "$1"
}

# Turn the marker-delimited stream from extract_pairs into tab-separated
# lines and keep only the short ones.
# Inputs:  stream on stdin
# Outputs: "<english>\t<french>" lines on stdout
format_pairs() {
  # 1. Drop every newline so that each record can be rebuilt from markers.
  # 2. @#@ ends a record -> newline; consecutive speech paragraphs of one
  #    language are joined with "|"; the en/fr boundary becomes a tab; the
  #    leading marker and the remaining </p> closers are removed.
  # 3. Discard records that still contain a paragraph opening tag, i.e. a
  #    paragraph that is not a plain speech paragraph.
  #    NOTE(review): the previous revision had `grep -v ""` here, which
  #    deletes every line; the real pattern was lost. "<p " is a
  #    reconstruction -- confirm the intended filter.
  # 4. Strip remaining attribute-less inline tags and turn "|" into a space.
  # 5. Keep records whose two fields are both shorter than 20 characters.
  tr -d "\n" \
    | sed -r \
      -e "s/@#@/\n/g" \
      -e "s/<\/p>${P_OPEN}/|/g" \
      -e "s/<\/p>@@@${P_OPEN}/\t/g" \
      -e "s/@@@${P_OPEN}//g" \
      -e "s/<\/p>//g" \
    | grep -v "<p " \
    | sed -r -e "s/<\/?\w+\/?>//g" -e "s/\|/ /g" \
    | awk 'BEGIN{FS="\t"}{if((length($1) < 20) && (length($2) < 20)) print}'
}

# Iterate with a glob instead of parsing `ls`, so file names containing
# whitespace or glob characters are handled correctly.
for session_file in sessions/*.xml; do
  # An unmatched glob stays literal in POSIX sh; skip it instead of failing.
  [ -e "$session_file" ] || continue
  extract_pairs "$session_file" | format_pairs
done