Testing multilingual semantic text similarity
from mozuma.torch.runners import TorchInferenceRunner
from mozuma.torch.options import TorchRunnerOptions
from mozuma.models.sentences import torch_distiluse_base_multilingual_v2
from mozuma.torch.datasets import ListDataset
from mozuma.callbacks.memory import CollectFeaturesInMemory
import torch
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's plain white style for the heatmap rendered later.
sns.set(style="white")
# IPython/Jupyter magic: render matplotlib figures inline in the notebook.
# NOTE(review): this line is only valid inside IPython/Jupyter — it is a
# syntax error if this file is executed as a plain Python script.
%matplotlib inline
- Create lists of sentences in multiple languages
Random sentences selected from https://tatoeba.org/en/
# Six renderings of the same sentence, "We were preparing food."
# (English, Esperanto, Dutch, French, Japanese, Spanish/Portuguese — presumably;
# sourced from https://tatoeba.org/en/).
sentences1 = [
    "We were preparing food.",
    "Ni preparis manĝaĵon.",
    "Wij maakten eten klaar.",
    "Nous préparâmes de la nourriture.",
    # Fixed: the leading 私 had been dropped ("たちは…" is truncated and
    # ungrammatical); restored so the sentence means "We prepared the meal."
    "私たちは食事の準備をした。",
    "Preparamos comida.",
]
# Three translations of one longer sentence about Anchises interpreting the
# oracle (Esperanto, French, Portuguese — presumably; verify against Tatoeba).
sentences2 = [
"Anĥizo interpretas la orakolon, kaj konvinkas la Trojanojn, ke temas pri la insulo Kreto, el kiu eliris unu el la unuatempaj fondintoj de Trojo.",
"Anchise explique l'oracle, et persuade aux Troyens qu'il s'agit de l'île de Crète, d'où est sorti un des anciens fondateurs de Troie.",
"Anquises interpreta o oráculo e convence os troianos de que se trata da ilha de Creta, da qual saiu um dos antigos fundadores de Troia.",
]
# Three translations of "I think I must be leaving since it is getting late."
# (Esperanto, French, English — presumably; verify against Tatoeba).
sentences3 = [
"Mi pensas, ke mi devus foriri, ĉar jam estas malfrue.",
"Je crois que je devrais partir car il se fait tard.",
"I think I must be leaving since it is getting late.",
]
- Define a ListDataset combining the three sentence lists
- Extract sentence feature vectors
# define sentence embedding model (CPU inference)
torch_device = torch.device("cpu")
model = torch_distiluse_base_multilingual_v2(device=torch_device)

# Fixed: `dset` was referenced below but never defined anywhere in the file
# (the "Define a ListDataset" cell had no code). Build it from all three
# sentence groups so the runner has data to iterate over.
dset = ListDataset(sentences1 + sentences2 + sentences3)

# define callback for collecting features in memory after each batch
sentence_features = CollectFeaturesInMemory()

# define the runner: feeds `dset` through `model`, one sentence per batch,
# and hands the resulting embeddings to the callback
runner = TorchInferenceRunner(
    dataset=dset,
    model=model,
    callbacks=[sentence_features],
    options=TorchRunnerOptions(
        data_loader_options={"batch_size": 1}, device=model.device, tqdm_enabled=True
    ),
)
runner.run()
- Compute sentence similarities
# Pairwise similarity matrix between every pair of sentence embeddings.
# NOTE(review): this is a plain dot product — it equals cosine similarity
# only if the embeddings are L2-normalised, which is presumably the case
# for this sentence model; confirm before relying on the absolute values.
embeddings = sentence_features.features
cos_sim = embeddings @ embeddings.T

# Render the matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(8, 6))
ax = sns.heatmap(cos_sim, cmap="PuRd", annot=True, fmt=".1")
We can see on the heatmap above that similar sentences in multiple languages have high cosine similarity scores, while unrelated sentences have low similarities.