#!/bin/sh
#
# Print aligned English/French speech text found in sessions/*.xml.
#
# For every /session/chapter/turn/speaker element that has at least one
# <p type="speech"> paragraph in both its English and its French <text>,
# one line is written to stdout:
#
#   <english text><TAB><french text>
#
# Paragraphs of the same language are joined with a single space, simple
# inline tags (e.g. <i>, </i>, <br/>) are removed, and a line is only kept
# when BOTH fields are shorter than 20 characters.
#
# Requirements: xmlstarlet, and GNU sed (the patterns use \n and \t in
# replacements and \w in regular expressions).

set -u

# Opening tag of a speech paragraph as copied out by `xmlstarlet --copy-of`.
# NOTE(review): in the revision under review this literal was missing from the
# sed expressions (blank lines stood in its place); it is reconstructed from
# the XPath predicate p[@type='speech'] -- confirm against real xmlstarlet
# output (attribute order / quoting).
speech_open='<p type="speech">'

# extract_records FILE
# Dump the matching speaker elements of FILE as one raw stream:
#   @@@ <english p elements> @@@ <french p elements> @#@
# "@@@" marks the start of each language, "@#@" the end of a record.
extract_records() {
  xmlstarlet sel --encode utf-8 --template \
    --match "/session/chapter/turn/speaker[text[@language='en']/p[@type='speech'] and text[@language='fr']/p[@type='speech']]" \
    --output "@@@" \
    --copy-of "text[@language='en']/p" \
    --output "@@@" \
    --copy-of "text[@language='fr']/p" \
    --output "@#@" \
    < "$1"
}

# format_records
# Turn the raw marker stream on stdin into tab-separated text lines on stdout.
format_records() {
  # 1. Join everything into one line so markers are the only separators.
  # 2. "@#@" becomes the record separator (newline); paragraph boundaries
  #    inside one language become "|", the en/fr boundary becomes a TAB,
  #    and the leading marker/tag and closing </p> tags are dropped.
  # 3. Drop records that still contain a paragraph opening tag, i.e. a
  #    paragraph that is not a plain speech paragraph.
  #    NOTE(review): the original grep pattern was empty (which discards
  #    every line); this filter is a reconstruction of the likely intent --
  #    confirm.
  # 4. Strip attribute-less inline tags and turn "|" into a space.
  # 5. Keep only pairs where both sides are shorter than 20 characters.
  tr -d '\n' \
    | sed -E \
      -e 's/@#@/\n/g' \
      -e "s/<\/p>${speech_open}/|/g" \
      -e "s/<\/p>@@@${speech_open}/\t/g" \
      -e "s/@@@${speech_open}//g" \
      -e 's/<\/p>//g' \
    | grep -v '<p[ >]' \
    | sed -E -e 's/<\/?\w+\/?>//g' -e 's/\|/ /g' \
    | awk 'BEGIN { FS = "\t" } (length($1) < 20) && (length($2) < 20)'
}

# Glob directly instead of parsing `ls`, so file names with whitespace work.
for session_file in sessions/*.xml; do
  # An unmatched glob stays literal; skip it instead of failing.
  [ -e "$session_file" ] || continue
  extract_records "$session_file" | format_records
done