diff --git a/README.md b/README.md index 0ab6a8d..3cc2948 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ChatCalledQuest module is a python module thought as Geografia's (an alternative Sign Language don't have an exact written counterpart since its simultaneous/non-linear gesture/facial expression structure, thus the adoption of gloss-level intermediate representation of the sign message (basically a simplified version of the spoken language) . -Current implementantion is simple and based on spacy python module, to see the usage open the jupyter-notebook ccq_usage.ipynb and run it yourself or have a look directly at ccq_usage.html +Current implementantion is simple and based on spacy python module, to see the usage open the jupyter-notebook usage.ipynb and run it yourself or have a look directly at usage.html -Everything under geografia_research sub folder is preliminary research, not really much manteined from a while yet possibly useful for the development of geografia project +Everything under the submodule georesearch sub folder is a collection of 2021 wonders and researches, not really much manteined from a while yet possibly useful for the development of geografia and ccq project diff --git a/usage.html b/usage.html new file mode 100644 index 0000000..42818d9 --- /dev/null +++ b/usage.html @@ -0,0 +1,15448 @@ + + +
+ + +The ccq module is designed as Geografia's backend engine and aims to translate from written spoken Italian to written LIS-glossed text (comprensione) and vice versa (produzione).
+Sign language does not have an exact written counterpart because of its simultaneous/non-linear gesture/facial-expression structure; hence the adoption of a gloss-level intermediate representation of the signed message (basically a simplified version of the spoken language).
+ +plain string as input
+ +import os
+os.environ['TOKENIZERS_PARALLELISM']='false'
+import spacy
+import spacy_transformers
+import ccq, config
+import importlib
+
importlib.reload(ccq)
+engine = ccq.CCQ()
+
affirmative_spoken = 'Luca va in Spagna la prossima estate'
+engine.translate(affirmative_spoken, direction='spoken2gloss')
+
(['prossimo', 'estate', 'luca', 'spagna', 'andare'], True, '')+
affirmative_spoken2 = 'Stasera voglio bere una birra'
+engine.translate(affirmative_spoken2, direction='spoken2gloss')
+
(['stasera', 'birra', 'volere', 'bere'], True, '')+
negative_spoken = 'Luca non va in Spagna la prossima estate'
+engine.translate(negative_spoken, direction='spoken2gloss')
+
(['prossimo', 'estate', 'luca', 'spagna', 'andare', 'no'], True, '')+
closed_interrogative_spoken = 'Luca va in Spagna la prossima estate?'
+engine.translate(closed_interrogative_spoken, direction='spoken2gloss')
+
(['prossimo', 'estate', 'spagna', 'andare', '?'], True, '')+
open_interrogative_spoken = 'Dove andrà Luca la prossima estate?'
+engine.translate(open_interrogative_spoken, direction='spoken2gloss')
+
(['prossimo', 'estate', 'luca', 'andere', 'dove', '?'], True, '')+
fail_spoken = open_interrogative_spoken*3
+engine.translate(fail_spoken, direction='spoken2gloss')
+
('', False, "failed ['max_tokens', 'valid_tokens']")+
fail_spoken = 'ciao Lucia, sei strana con questa punteggiatura.'
+engine.translate(fail_spoken, direction='spoken2gloss')
+
('', False, "failed ['punct']")+
fail_spoken = 'Luca, Antonio, Paolo e Marco vanno al mare'
+engine.translate(fail_spoken, direction='spoken2gloss')
+
('', False, "failed ['punct', 'subject']")+
fail_spoken = 'Luca, Paolo e Marco vanno al mare'
+engine.translate(fail_spoken, direction='spoken2gloss')
+
('', False, "failed ['subject']")+
Here the input is assumed to be a list of space/punctuation-separated word tokens, each paired with its attribute, if any.
+The currently supported attributes (required whenever the corresponding token is present) are:
+affirmative_gloss = [('prossimo','time'),('estate','time'),('luca','subject'),('spagna',''),('andare','verb')]
+
+engine.translate(affirmative_gloss, direction='gloss2spoken')
+
(['luca', 'andare', 'prossimo', 'estate', 'spagna'], True, '')+
engine.translate(closed_interrogative_spoken, direction='spoken2gloss')
+
(['prossimo', 'estate', 'spagna', 'andare', '?'], True, '')+
negative_gloss = [('prossimo','time'),('estate','time'),('luca','subject'),('spagna',''),('andare','verb'),('no','')]
+
+engine.translate(negative_gloss, direction='gloss2spoken')
+
(['luca', 'andare', 'prossimo', 'estate', 'spagna', 'no'], True, '')+
open_interrogative_gloss = [('prossimo','time'),('estate','time'),('luca','subject'),('andare','verb'),('dove',''),('?','')]
+
+engine.translate(open_interrogative_gloss, direction='gloss2spoken')
+
(['luca', 'dove', 'andare', 'prossimo', 'estate', '?'], True, '')+
closed_interrogative_gloss = [('prossimo','time'),('estate','time'),('spagna',''),('andare','verb'),('luca','subject'),('?','')]
+
+engine.translate(closed_interrogative_gloss, direction='gloss2spoken')
+
(['luca', 'andare', 'prossimo', 'estate', 'spagna', '?'], True, '')+
+
from transformers import pipeline
+classifier = pipeline("zero-shot-classification",model="Jiva/xlm-roberta-large-it-mnli", use_fast=True, multi_label=True)
+
# we will classify the following Wikipedia entry about Sardinia
+sequence_to_classify = "La Sardegna è una regione italiana a statuto speciale di 1 592 730 abitanti con capoluogo Cagliari, la cui denominazione bilingue utilizzata nella comunicazione ufficiale è Regione Autonoma della Sardegna / Regione Autònoma de Sardigna."
+# we can specify candidate labels in Italian:
+candidate_labels = ["geografia", "politica", "macchine", "cibo", "moda"]
+classifier(sequence_to_classify, candidate_labels)
+
{'sequence': 'La Sardegna è una regione italiana a statuto speciale di 1 592 730 abitanti con capoluogo Cagliari, la cui denominazione bilingue utilizzata nella comunicazione ufficiale è Regione Autonoma della Sardegna / Regione Autònoma de Sardigna.', + 'labels': ['geografia', 'macchine', 'politica', 'cibo', 'moda'], + 'scores': [0.38871291279792786, + 0.22633220255374908, + 0.1939833015203476, + 0.13735689222812653, + 0.13708347082138062]}+
extend & test failure management for gloss2spoken translation
+embellish gloss2spoken: add articles, conjunctions, verb conjugation
+the backend will be linked to some kind of DB storing images indexed by glosses. The DB will most probably be fairly limited in size and general in semantic terms (an image could have more than one index, and an index could probably have more than one image), so there will also be a need to implement a word-vector similarity engine (for synonyms) and a zero-shot transformer (in order to exploit the contextual meaning of a sentence for the representation of each word token)
+cleaner code
+https://github.com/janlukasschroeder/nlp-cheat-sheet-python
+spacy
+ +transformers
+italian zero-shot : https://huggingface.co/Jiva/xlm-roberta-large-it-mnli
+italian fill mask : https://huggingface.co/Musixmatch/umberto-wikipedia-uncased-v1
+stanford transformers--> stanza
+ +transformer + spacy : italian NER
+https://huggingface.co/bullmount/it_nerIta_trf
+pip install https://huggingface.co/bullmount/it_nerIta_trf/resolve/main/it_nerIta_trf-any-py3-none-any.whl
+
+