Replacement rules

Global

First run

/\xad/u -> ""
/\xa0/u -> " "
/–/ -> "-"
/ʼ/ -> "'"
/&#269;/ -> "č"
/&amp;/ -> "&"
/\(\d+\)\.$/ -> "."
/\.\(\d+\)$/ -> "."
/^\s*\(\d+\) / -> ""
/\((\w+)$/ -> "(\1)"
/<[.]{3}|…>/ -> "<ellipsis />"
/[.]{3}|…/ -> "<ellipsis />"
/\(\)/ -> ""
/\($/ -> ""
/^<$/ -> ""  
/(\d+º)/ -> "<ord>\1</ord>"  
/(http:\/\/[a-z.-]+)"?\s*(\.| \w)/ -> "<url>\1</url>\2"  
/\((A\d-\d{4}\/(?:\d{2})?\d{2})\)/ -> "<report>\1</report>"
/(?<!>)(A\d-\d{4}(?:\/(?:\d{2})?\d{2})?)/ -> "<report>\1</report>"
/\[(\d{4}\/\d{4}\([A-Z]+\))\]/ -> "<procedure>\1</procedure>"  
/\(((?:[A-Z]+\(\d+\))?[A-Z\d\/ ]*\d{4} - C\d-\d{4})[7\/]((?:\d{2})?\d{2}(?: - \d{4}\/\d{4}\/?(?:\([A-Z]+\))?)?)\)/ -> "<ref>\1/\2</ref>"
/ ,/ -> ','
/(\([^)]+)$/ -> '\1)'
/\s\s+/ -> ' '

Second run

/" ?([^"”]+)["”]/u -> "<quote>\1</quote>"
/« ?([^»]+?) ?»/u -> "<quote>\1</quote>"
/(?<=>) ([:;.!?])/ -> "\1"
/<\/quote> ,/ -> "</quote>,"
/^"\s+/ -> ""
/^\)\s+/ -> ""
/&/ -> "&amp;"
/,,+/ -> ","

Local (language specific)

First run

Danish

/[»]([^«]+)«/u -> "<quote>\1</quote>"

German

/„([^“]+)(["”“])([^„]+„[^\2]+\2)*/u -> "<quote>\1</quote>"
/(?:,,)([\w ]+)"/u -> "<quote>\1</quote>"

English

/\' ?s\b/u -> "<gen>’s</gen>"
/["]([^\']+)\'/u -> "<quote>\1</quote>"

Spanish

/ -(?=\w)/u -> " —"
/(?<=\w)-(\W)/u -> "—\1"

Greek

/\' \b/u -> "’ "

Finnish

/\b(\p{Lu}+): ?n\b/u -> "\1<gen>:n</gen>"

French

/\b(l|d|n|j|t|m|qu|c|s|jusqu|lorsqu|aujourd|puisqu|quelqu|quoiqu)\' ?/iu -> "\1’"
/»([^»]+?)»/u -> "<quote>\1</quote>"

Italian

/\be[\'’]/u -> "è"
/\bE[\'’]/u -> "È"
/\bpò\b/u -> "po’"
/\bpo\'\b/u -> "po’"
/\bperchè\b/u -> "perché"
/\bpoichè\b/u -> "poiché"
/\baffinchè\b/u -> "affinché"
/\bpero\b/u -> "però"
/\b(un|l|d|dell|nell|all|dall|sull)\' ?/iu -> "\1’"
/[“"]([^”]+)”/u -> "<quote>\1</quote>"
/["]([^»]+)»/u -> "<quote>\1</quote>"
/»([^»]+?)»/u -> "<quote>\1</quote>"

Portuguese

/["]([^»]+)»/u -> "<quote>\1</quote>"

Swedish

/\b(\p{Lu}+): ?s\b/u -> "\1<gen>:s</gen>"

Second run

Bulgarian

/<quote>/ -> "<quote start="„" end="“">"

Czech

/<quote>/ -> "<quote start="„" end="“">"

Danish

/<quote>/ -> "<quote start="»" end="«">"

German

/<quote>/ -> "<quote start="„" end="“">"
/<\/quote>(\w)/ -> "</quote>-\1"

Greek

/<quote>/ -> "<quote start="«" end="»">"

English

/<quote>/ -> "<quote start="‘" end="’">"

Spanish

/<quote>/ -> "<quote start="«" end="»">"

Estonian

/<quote>/ -> "<quote start="„" end="”">"

Finnish

/<quote>/ -> "<quote start="”" end="”">"

French

/(?<! |&amp|&[lg]t)([?!:;])/u -> " \1"
/<quote>/ -> "<quote start="« " end=" »">"

Hungarian

/<quote>/ -> "<quote start="„" end="”">"

Italian

/<quote>/ -> "<quote start="«" end="»">"

Lithuanian

/<quote>/ -> "<quote start="„" end="“">"

Latvian

/<quote>/ -> "<quote start="„" end="“">"

Dutch

/<quote>/ -> "<quote start="“" end="”">"

Polish

/<quote>/ -> "<quote start="„" end="”">"

Portuguese

/<quote>/ -> "<quote start="«" end="»">"

Romanian

/<quote>/ -> "<quote start="„" end="”">"

Slovak

/<quote>/ -> "<quote start="„" end="“">"

Slovenian

/<quote>/ -> "<quote start="„" end="“">"

Swedish

/<quote>/ -> "<quote start="”" end="”">"

CL Wiki

Institute of Computational Linguistics – University of Zurich