Testing multilingual semantic text similarity
from mozuma.torch.runners import TorchInferenceRunner
from mozuma.torch.options import TorchRunnerOptions
from mozuma.models.sentences import torch_distiluse_base_multilingual_v2
from mozuma.torch.datasets import ListDataset
from mozuma.callbacks.memory import CollectFeaturesInMemory
import torch
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's plain white style for the heatmap rendered later.
sns.set(style="white")
# IPython/Jupyter magic: render matplotlib figures inline in the notebook.
# NOTE(review): this line is only valid inside IPython/Jupyter — it is a
# syntax error if this file is executed as a plain Python script.
%matplotlib inline
- Create lists of sentences in multiple languages
Random sentences selected from https://tatoeba.org/en/
# Six renderings of the same sentence, "We were preparing food."
# (English, Esperanto, Dutch, French, Japanese, Spanish/Portuguese — presumably;
# sourced from https://tatoeba.org/en/).
sentences1 = [
    "We were preparing food.",
    "Ni preparis manĝaĵon.",
    "Wij maakten eten klaar.",
    "Nous préparâmes de la nourriture.",
    # Fixed: the leading 私 had been dropped ("たちは…" is truncated and
    # ungrammatical); restored so the sentence means "We prepared the meal."
    "私たちは食事の準備をした。",
    "Preparamos comida.",
]
# Three translations of one longer sentence about Anchises interpreting the
# oracle (Esperanto, French, Portuguese — presumably; verify against Tatoeba).
sentences2 = [
"Anĥizo interpretas la orakolon, kaj konvinkas la Trojanojn, ke temas pri la insulo Kreto, el kiu eliris unu el la unuatempaj fondintoj de Trojo.",
"Anchise explique l'oracle, et persuade aux Troyens qu'il s'agit de l'île de Crète, d'où est sorti un des anciens fondateurs de Troie.",
"Anquises interpreta o oráculo e convence os troianos de que se trata da ilha de Creta, da qual saiu um dos antigos fundadores de Troia.",
]
# Three translations of "I think I must be leaving since it is getting late."
# (Esperanto, French, English — presumably; verify against Tatoeba).
sentences3 = [
"Mi pensas, ke mi devus foriri, ĉar jam estas malfrue.",
"Je crois que je devrais partir car il se fait tard.",
"I think I must be leaving since it is getting late.",
]
- Define a ListDataset combining the three sentence lists
- Extract sentence feature vectors
# define sentence embedding model (CPU inference)
torch_device = torch.device("cpu")
model = torch_distiluse_base_multilingual_v2(device=torch_device)

# Fixed: `dset` was referenced below but never defined anywhere in the file
# (the "Define a ListDataset" cell had no code). Build it from all three
# sentence groups so the runner has data to iterate over.
dset = ListDataset(sentences1 + sentences2 + sentences3)

# define callback for collecting features in memory after each batch
sentence_features = CollectFeaturesInMemory()

# define the runner: feeds `dset` through `model`, one sentence per batch,
# and hands the resulting embeddings to the callback
runner = TorchInferenceRunner(
    dataset=dset,
    model=model,
    callbacks=[sentence_features],
    options=TorchRunnerOptions(
        data_loader_options={"batch_size": 1}, device=model.device, tqdm_enabled=True
    ),
)
runner.run()
- Compute sentence similarities
# Pairwise similarity matrix between every pair of sentence embeddings.
# NOTE(review): this is a plain dot product — it equals cosine similarity
# only if the embeddings are L2-normalised, which is presumably the case
# for this sentence model; confirm before relying on the absolute values.
embeddings = sentence_features.features
cos_sim = embeddings @ embeddings.T

# Render the matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(8, 6))
ax = sns.heatmap(cos_sim, cmap="PuRd", annot=True, fmt=".1")
We can see on the heatmap above that similar sentences in multiple languages have high cosine similarity scores, while unrelated sentences have low similarities.