In [1]:
import os
import math
import lzma

Download corpus via `wget -r -np https://pub.cl.uzh.ch/corpora/PaCoCo/Rumantsch_Grischun/`

In [3]:
# Local directory holding the corpus files (presumably the mirror created by
# the recursive wget download — verify against your download location).
PATH = 'pub.cl.uzh.ch/corpora/PaCoCo/Rumantsch_Grischun'
# File-name templates; the `{}` placeholder is filled with a language code
# by readxzfile().
TOKEN_FILES = 'rumantsch-grischun.token.{}.tsv.xz'
SENTALIGN_FILES = 'rumantsch-grischun.sentence_alignment.{}.tsv.xz'
# Language codes processed throughout the notebook.
LANGUAGES = ['de', 'rm']

This function reads from compressed files directly:

In [4]:
def readxzfile(filename, language):
    """Yield the data rows of an xz-compressed PaCoCo (extended CoNLL-U) file.

    Args:
        filename: Path template containing a ``{}`` placeholder that is filled
            with ``language`` via ``str.format``.
        language: Value substituted into the template (e.g. ``'de'``).

    Yields:
        list[str]: The tab-separated fields of one row. Empty lines and lines
        starting with ``#`` are skipped.
    """
    # The context manager guarantees the file handle is released when the
    # generator is exhausted, closed, or garbage-collected.
    with lzma.open(filename.format(language), mode='rt', encoding='utf-8') as fh:
        for line in fh:
            if line == '\n' or line.startswith('#'):
                continue
            # rstrip() removes the newline (and any other trailing whitespace)
            # before the row is split into its columns.
            yield line.rstrip().split('\t')

Read in sentence alignment units (relation sentenceID <=> sentenceAU):

In [6]:
# Nested mapping: alignment-unit ID -> language -> list of sentence IDs.
# Column 0 of each row is read as the sentence ID, column 1 as the ID of the
# alignment unit it belongs to.
sentAUs = {}
for language in LANGUAGES:
    alignment_path = os.path.join(PATH, SENTALIGN_FILES)
    for row in readxzfile(alignment_path, language):
        unit = sentAUs.setdefault(int(row[1]), {})
        sentence_ids = unit.setdefault(language, [])
        sentence_ids.append(int(row[0]))

Read in token lists (relation token <=> sentenceID):

In [7]:
# Mapping: sentence ID -> list of token strings, in file order.
# Column 11 of each row is read as the sentence ID and column 1 as the token.
# Both languages share this one dict, so sentence IDs are presumably unique
# across languages — verify against the corpus documentation.
tokens = {}
for language in LANGUAGES:
    token_path = os.path.join(PATH, TOKEN_FILES)
    for row in readxzfile(token_path, language):
        sentence_tokens = tokens.setdefault(int(row[11]), [])
        sentence_tokens.append(row[1])

Let's have a look at an example:

In [8]:
# Sentence IDs grouped under alignment unit 1000, per language.
sentAUs[1000]

{'de': [10039484], 'rm': [160039414]}

In [7]:
# Tokens of the German ('de') sentence of that alignment unit, space-joined.
' '.join(tokens[10039484])

'^2 Jeder Indossant , der den Wechsel eingelöst hat , kann sein Indossament und die Indossamente seiner Nachmänner ausstreichen .'

In [9]:
# Tokens of the Romansh ('rm') sentence of that alignment unit, space-joined.
' '.join(tokens[160039414])

'^2 Mintga indossant che ha pajà la cambiala po stritgar ses indossament e quels dals indossants posteriurs .'

We iterate over all sentence alignment units and, for both languages, count in how many units each (lowercased) token occurs; we also build a cooccurrence matrix of German and Romansh tokens:

In [11]:
# tlist:    language -> tokens of the alignment unit currently being processed
# pairs:    token lists (per language) of every unit used for cooccurrences
# freq:     language -> lowercased token -> number of alignment units it occurs in
# coo:      German token -> Romansh token -> number of units containing both
tlist = {}
pairs = []
# Pre-create one counter per language so freq[language] always exists.
freq = {language: {} for language in LANGUAGES}
coo = {}
for sentAU in sentAUs:
    # Fresh containers per unit: nothing is shared between iterations, so the
    # token lists can be stored in `pairs` without copying.
    tlist = {}
    termlist = {}
    for language in LANGUAGES:
        # A unit without sentences in this language yields an empty token list
        # (and is dropped by the length filter below) instead of a KeyError.
        sentence_ids = sentAUs[sentAU].get(language, [])
        tlist[language] = [t for sent_id in sentence_ids for t in tokens[sent_id]]
        # Each distinct lowercased token is counted once per alignment unit.
        termlist[language] = {w.lower() for w in tlist[language]}
        counts = freq[language]
        for w in termlist[language]:
            counts[w] = counts.get(w, 0) + 1
    # Skip units with fewer than 3 tokens on either side.
    # NOTE(review): frequencies above are counted before this filter, so freq
    # also covers units that do not contribute to coo — confirm this is intended.
    if len(tlist['de']) < 3 or len(tlist['rm']) < 3:
        continue
    # Cooccurrence matrix: every German type paired with every Romansh type
    # of the same alignment unit.
    for w1 in termlist['de']:
        partners = coo.setdefault(w1, {})
        for w2 in termlist['rm']:
            partners[w2] = partners.get(w2, 0) + 1
    pairs.append(tlist)

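The sum of all frequencies per language (each lowercased token is counted once per alignment unit, so this is not the raw number of running tokens):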

In [12]:
# Per language, add up the counts of all entries in the frequency table.
total = {language: sum(freq[language].values()) for language in LANGUAGES}

In [13]:
# Display the per-language sums.
total

{'de': 381932, 'rm': 454399}

The ten most frequent tokens per language (lowercased): 

In [14]:
# Ten German entries with the highest counts; negating the count sorts
# descending while keeping the original order among equal counts.
sorted(freq['de'].items(), key=lambda entry: -entry[1])[:10]

[('.', 14617),
 ('die', 11219),
 ('der', 10516),
 (',', 9728),
 ('art.', 7274),
 ('und', 7198),
 ('oder', 5171),
 ('des', 4759),
 ('^1', 3973),
 ('^2', 3961)]

In [15]:
# Ten Romansh entries with the highest counts; negating the count sorts
# descending while keeping the original order among equal counts.
sorted(freq['rm'].items(), key=lambda entry: -entry[1])[:10]

[('da', 14640),
 ('.', 14517),
 ('la', 11573),
 (',', 8319),
 ('il', 7737),
 ("l'", 7668),
 ('art.', 7288),
 ('en', 6157),
 ('e', 6102),
 ('las', 5640)]

Ranking of the cooccurrences using mutual information (MI) score:

In [17]:
# ranking: (German token, Romansh token) -> MI score, log2(observed / expected).
ranking = {}
# Normaliser for the expected cooccurrence count; it does not change inside the
# loops, so it is computed once.
# NOTE(review): this is the smaller of the two per-language frequency sums, not
# the number of alignment units — confirm this is the intended normaliser.
corpus_size = min(total.values())
for w1, w2list in coo.items():
    for w2, count in w2list.items():
        o = count
        # skip pairs that cooccur only once
        if o == 1:
            continue
        # expected cooccurrence count if the two tokens were independent
        e = freq['de'][w1] * freq['rm'][w2] / corpus_size
        # skip any negative association
        if o < e:
            continue
        # MI is calculated bitwise (base=2)
        score = math.log(o/e, 2) # MI
        ranking[(w1, w2)] = score

Pairs with the highest mutual information:

In [18]:
# The 1000 pairs with the highest MI score; negating the score sorts
# descending while keeping the original order among equal scores.
sorted(ranking.items(), key=lambda entry: -entry[1])[:1000]

[(('allmächtigen', 'tutpussant'), 17.542956275013225),
 (('offenheit', 'avertadad'), 17.542956275013225),
 (('frieden', 'conscients'), 17.542956275013225),
 (('bestreben', 'conscients'), 17.542956275013225),
 (('zürich', 'turitg'), 17.542956275013225),
 (('wallis', 'vallais'), 17.542956275013225),
 (('5a', '5a'), 17.542956275013225),
 (('unmenschlicher', 'crudaivel'), 17.542956275013225),
 (('grausamer', 'crudaivel'), 17.542956275013225),
 (('gewissensfreiheit', 'conscienza'), 17.542956275013225),
 (('gewissensfreiheit', 'cretta'), 17.542956275013225),
 (('fernmeldetechnischen', 'telediffusiun'), 17.542956275013225),
 (('unverfälschte', 'nunfalsifitgada'), 17.542956275013225),
 (('bedrohungen', 'guerras'), 17.542956275013225),
 (('61a', '61a'), 17.542956275013225),
 (('abschlüssen', 'passadis'), 17.542956275013225),
 (('herkömmliche', 'tradiziunalas'), 17.542956275013225),
 (('stauanlagen', 'serra'), 17.542956275013225),
 (('niederschläge', 'precipitaziuns'), 17.542956275013225),
 (('s

Inspect the highest MI scores for «ergebnis»:

In [19]:
# Ten highest-scoring pairs whose German side is exactly 'ergebnis'.
sorted(
    (entry for entry in ranking.items() if entry[0][0] == 'ergebnis'),
    key=lambda entry: -entry[1],
)[:10]

[(('ergebnis', 'verifitgà'), 13.413673258068261),
 (('ergebnis', 'resumescha'), 13.413673258068261),
 (('ergebnis', 'resultat'), 12.323475449096682),
 (('ergebnis', 'rimnada'), 12.091745163180898),
 (('ergebnis', 'indirecta'), 11.413673258068261),
 (('ergebnis', 'falsifitgescha'), 11.298196040648325),
 (('ergebnis', 'constataziuns'), 10.32621041681792),
 (('ergebnis', 'total'), 9.954241639430963),
 (('ergebnis', 'cuntraproposta'), 9.954241639430963),
 (('ergebnis', 'suttametta'), 9.921820161738585)]

Inspect the highest MI scores for words starting with «viagiant»:

In [20]:
# Ten highest-scoring pairs whose Romansh side starts with 'viagiant'.
sorted(
    (entry for entry in ranking.items() if entry[0][1].startswith('viagiant')),
    key=lambda entry: -entry[1],
)[:10]

[(('fahrenden', 'viagiants'), 17.542956275013225),
 (('fahrenden', 'viagiantas'), 16.95799377429207),
 (('ihrer', 'viagiantas'), 9.112503723347695),
 (('der', 'viagiantas'), 5.182657848395181),
 (('der', 'viagiants'), 5.182657848395181),
 ((',', 'viagiantas'), 4.7100662608484845),
 (('.', 'viagiantas'), 4.122634152483831)]