| | from fastapi import FastAPI, File, UploadFile, Form
|
| | from fastapi.responses import JSONResponse
|
| | import uvicorn
|
| | import tempfile
|
| | import nemo.collections.asr as nemo_asr
|
| | import re
|
| | import os
|
| | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
| | import torch
|
| | from word2number import w2n
|
| | from deep_translator import GoogleTranslator
|
| |
|
| |
|
# Lookup table: spoken Arabic number words (Modern Standard and Egyptian
# colloquial spellings) and Arabic-Indic digit strings -> ASCII digit strings.
#
# Keys must not carry leading/trailing whitespace: they are matched between
# regex word boundaries, so a padded key would swallow the separator space and
# could never match at the end of the text.
#
# The "<digit> <word>" / "<digit><word>" keys (e.g. "1 عشر") cover text in
# which the unit is already written as a digit.
arabic_numbers = {
    # 0-9 (including common mis-transcriptions of "zero")
    "صفر": "0", "زيرو": "0", "٠": "0", "زيو": "0", "زير": "0", "زر": "0", "زروا": "0", "زرا": "0", "زيره": "0", "زرو": "0",
    "واحد": "1", "واحدة": "1", "١": "1",
    "اتنين": "2", "اثنين": "2", "إثنين": "2", "اثنان": "2", "إثنان": "2", "٢": "2",
    "تلاتة": "3", "ثلاثة": "3", "٣": "3",
    "اربعة": "4", "أربعة": "4", "٤": "4",
    "خمسة": "5", "٥": "5",
    "ستة": "6", "٦": "6",
    "سبعة": "7", "٧": "7",
    "تمانية": "8", "ثمانية": "8", "٨": "8",
    "تسعة": "9", "٩": "9",

    # 10
    "عشرة": "10", "١٠": "10",

    # 11
    "احد عشر": "11", "واحد عشر": "11", "حداشر": "11",
    "١ عشر": "11", "1 عشر": "11", "١عشر": "11", "1عشر": "11",
    "١١": "11", "11": "11",

    # 12
    "اثنا عشر": "12", "اثني عشر": "12", "اتناشر": "12",
    "٢ عشر": "12", "2 عشر": "12", "٢عشر": "12", "2عشر": "12",
    "١٢": "12", "12": "12",

    # 13
    "ثلاثة عشر": "13", "تلاتة عشر": "13", "تلتاشر": "13",
    "٣ عشر": "13", "3 عشر": "13", "٣عشر": "13", "3عشر": "13",
    "١٣": "13", "13": "13",

    # 14
    "أربعة عشر": "14", "اربعة عشر": "14", "اربعتاشر": "14",
    "٤ عشر": "14", "4 عشر": "14", "٤عشر": "14", "4عشر": "14",
    "١٤": "14", "14": "14",

    # 15
    "خمسة عشر": "15", "خمسه عشر": "15", "خمستاشر": "15",
    "٥ عشر": "15", "5 عشر": "15", "٥عشر": "15", "5عشر": "15",
    "١٥": "15", "15": "15",

    # 16
    "ستة عشر": "16", "سته عشر": "16", "ستاشر": "16",
    "٦ عشر": "16", "6 عشر": "16", "٦عشر": "16", "6عشر": "16",
    "١٦": "16", "16": "16",

    # 17
    "سبعة عشر": "17", "سبعه عشر": "17", "سبعتاشر": "17",
    "٧ عشر": "17", "7 عشر": "17", "٧عشر": "17", "7عشر": "17",
    "١٧": "17", "17": "17",

    # 18
    "ثمانية عشر": "18", "تمانية عشر": "18", "طمنتاشر": "18",
    "٨ عشر": "18", "8 عشر": "18", "٨عشر": "18", "8عشر": "18",
    "١٨": "18", "18": "18",

    # 19
    "تسعة عشر": "19", "تسعه عشر": "19", "تسعتاشر": "19",
    "٩ عشر": "19", "9 عشر": "19", "٩عشر": "19", "9عشر": "19",
    "١٩": "19", "19": "19",

    # Tens
    "عشرين": "20", "٢٠": "20",
    "تلاتين": "30", "ثلاثين": "30", "٣٠": "30",
    "اربعين": "40", "أربعين": "40", "٤٠": "40",
    "خمسين": "50", "٥٠": "50",
    "ستين": "60", "٦٠": "60",
    "سبعين": "70", "٧٠": "70",
    "تمانين": "80", "ثمانين": "80", "٨٠": "80", "تمانون": "80", "ثمانون": "80",
    "تسعين": "90", "٩٠": "90",

    # Hundreds
    "مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100",
    "ميتين": "200", "مائتين": "200",
    "تلاتمية": "300", "ثلاثمائة": "300",
    "اربعمية": "400", "أربعمائة": "400",
    "خمسمية": "500", "خمسمائة": "500",
    "ستمية": "600", "ستمائة": "600",
    "سبعمية": "700", "سبعمائة": "700",
    "تمانمية": "800", "ثمانمائة": "800",
    "تسعمية": "900", "تسعمائة": "900",

    # Thousands
    "ألف": "1000", "الف": "1000", "١٠٠٠": "1000",
    "ألفين": "2000", "الفين": "2000",
    "تلات تلاف": "3000", "ثلاثة آلاف": "3000",
    "اربعة آلاف": "4000", "أربعة آلاف": "4000",
    "خمسة آلاف": "5000",
    "ستة آلاف": "6000",
    "سبعة آلاف": "7000",
    "تمانية آلاف": "8000", "ثمانية آلاف": "8000",
    "تسعة آلاف": "9000",

    # Larger magnitudes
    "عشرة آلاف": "10000",
    "مية ألف": "100000", "مائة ألف": "100000",
    "مليون": "1000000", "١٠٠٠٠٠٠": "1000000",
    "ملايين": "1000000",
    "مليار": "1000000000", "١٠٠٠٠٠٠٠٠٠": "1000000000",

    # 21-29
    # NOTE(review): the digit-prefixed variants mix ASCII ("1 ...") and
    # Arabic-Indic ("٢ ...") digits; confirm which forms are actually needed.
    "واحد وعشرون": "21", "1 وعشرون": "21",
    "اثنان وعشرون": "22", "٢ وعشرون": "22",
    "ثلاثة وعشرون": "23", "٣ وعشرون": "23",
    "اربعة وعشرون": "24", "٤ وعشرون": "24",
    "خمسة وعشرون": "25", "٥ وعشرون": "25",
    "ستة وعشرون": "26", "٦ وعشرون": "26",
    "سبعة وعشرون": "27", "٧ وعشرون": "27",
    "تمانية وعشرون": "28", "ثمانية وعشرون": "28", "٨ وعشرون": "28",
    "تسعة وعشرون": "29", "٩ وعشرون": "29",

    # Assorted compound numbers and repeated-digit pairs
    "ثمانية وثمانون": "88", "8 وثمانون": "88",
    "اثنان وثمانون": "82", "٢ وثمانون": "82",
    "خمسة وستون": "65", "5 وستون": "65",
    "ثلاثة وثلاثون": "33", "٣٣": "33", "33": "33",
    "أربعة وأربعون": "44", "٤٤": "44", "44": "44",
    "خمسة وخمسون": "55", "٥٥": "55", "55": "55",
    "ستة وستون": "66", "٦٦": "66", "66": "66",
    "سبعة وسبعون": "77", "٧٧": "77", "77": "77",
    "٨٨": "88", "88": "88",
    "تسعة وتسعون": "99", "٩٩": "99", "99": "99",
}
|
| |
|
def replace_arabic_numbers(text: str, mapping=None) -> str:
    """Replace whole-word occurrences of known number words with digit strings.

    Parameters:
    - text (str): The text to normalize.
    - mapping (dict[str, str] | None): Word -> replacement table. Defaults to
      the module-level ``arabic_numbers`` table when omitted.

    Returns:
    - str: ``text`` with every whole-word match of a key replaced by its value.

    Keys are applied longest first, so a multi-word phrase is matched before
    any shorter key it contains (otherwise the shorter key would rewrite part
    of the phrase and the phrase could never match). ``sorted`` is stable, so
    keys of equal length keep the table's own order.
    """
    table = arabic_numbers if mapping is None else mapping

    for word in sorted(table, key=len, reverse=True):
        replacement = table[word]
        # re.escape keeps keys literal; the callable keeps the replacement
        # literal too (no backslash/group-reference interpretation).
        text = re.sub(
            rf"\b{re.escape(word)}\b",
            lambda _match, value=replacement: value,
            text,
        )
    return text
|
| |
|
| |
|
| |
|
# Application instance; the startup hook and the /transcribe and
# /transcribe-bytes routes in this file are registered on it.
app = FastAPI(title="Arabic ASR API", description="ASR API with NeMo and Arabic digit conversion")
|
| |
|
| |
|
@app.on_event("startup")
def load_model():
    """Load the ASR model and the translation model when the app starts.

    Sets the module globals ``asr_model``, ``tokenizer``, ``model`` and
    ``device``.

    Both model locations can be overridden through environment variables so
    the service is not tied to one developer machine; the defaults are the
    previously hard-coded values:
    - ``ASR_MODEL_PATH``: path to the ``.nemo`` checkpoint.
    - ``TRANSLATOR_MODEL_NAME``: model id/path passed to ``from_pretrained``.
    """
    global asr_model
    global model
    global tokenizer
    global device

    model_path = os.environ.get(
        "ASR_MODEL_PATH",
        "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Peter_Projects/NP_Detection_Nvidia_conformer/stt_ar_fastconformer_hybrid_large_pc_v1.0.nemo",
    )
    # NOTE(review): the checkpoint name says "hybrid" while it is restored as
    # EncDecCTCModel - confirm this is the intended model class.
    asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)

    model_translator_name = os.environ.get(
        "TRANSLATOR_MODEL_NAME", "ukaAi/Egyptian_dialect_to_arabic"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_translator_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_translator_name)

    # Run the translation model on the GPU when one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
|
def translate_egyptian_to_english(text: str) -> str:
    """
    Translates Egyptian Arabic text to English with the globally loaded
    seq2seq model and tokenizer (see ``load_model``).

    Parameters:
    - text (str): The input Egyptian Arabic text

    Returns:
    - str: The translated English text, or "" when ``text`` is empty or
      contains only whitespace (nothing is sent to the model in that case).

    NOTE(review): the model loaded at startup is named
    "Egyptian_dialect_to_arabic" while this function forces "eng_Latn" as the
    target language - confirm the checkpoint supports that target.
    """
    # An empty transcript has nothing to translate; skip the model call.
    if not text or not text.strip():
        return ""

    # Source/target language codes; this sets state on the shared tokenizer.
    tokenizer.src_lang = "arz_Arab"
    forced_bos_token_id = tokenizer.convert_tokens_to_ids("eng_Latn")

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Move every input tensor to the device the model lives on.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    translated = model.generate(
        **inputs,
        forced_bos_token_id=forced_bos_token_id,
        max_length=512,
        num_beams=4,
        early_stopping=True
    )

    # Only one text is passed in, so the first sequence is the translation.
    return tokenizer.decode(translated[0], skip_special_tokens=True)
|
| |
|
# English cardinal words -> integer value.
# Covers zero through nineteen and the tens up to ninety; there are no
# multiplier words ("hundred", "thousand") in this table.
WORD_TO_NUM = {
    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
    "eleven": 11, "twelve": 12, "thirteen": 13, "fourteen": 14,
    "fifteen": 15, "sixteen": 16, "seventeen": 17, "eighteen": 18,
    "nineteen": 19, "twenty": 20, "thirty": 30, "forty": 40,
    "fifty": 50, "sixty": 60, "seventy": 70, "eighty": 80, "ninety": 90
}
|
| |
|
| |
|
# English ordinal words -> the integer of the matching cardinal.
# Covers "first" through "twentieth" and the tens up to "ninetieth".
ORDINAL_TO_NUM = {
    "first": 1, "second": 2, "third": 3, "fourth": 4, "fifth": 5,
    "sixth": 6, "seventh": 7, "eighth": 8, "ninth": 9, "tenth": 10,
    "eleventh": 11, "twelfth": 12, "thirteenth": 13, "fourteenth": 14,
    "fifteenth": 15, "sixteenth": 16, "seventeenth": 17, "eighteenth": 18,
    "nineteenth": 19, "twentieth": 20, "thirtieth": 30, "fortieth": 40,
    "fiftieth": 50, "sixtieth": 60, "seventieth": 70, "eightieth": 80, "ninetieth": 90
}
|
| |
|
def normalize_token(token: str):
    """Convert a single token or hyphenated token into a number if possible.

    Parameters:
    - token (str): One whitespace-delimited word; matching is case-insensitive.

    Returns:
    - int | None: The numeric value, or None when the token is not a known
      number word.

    Lookup order:
    1. A plain ordinal ("third" -> 3).
    2. A hyphenated compound: the values of all recognized parts are summed
       ("twenty-three" -> 23, "twenty-first" -> 21). Unrecognized parts are
       ignored.
    3. A plain cardinal ("seven" -> 7).
    """
    word = token.lower()

    if word in ORDINAL_TO_NUM:
        return ORDINAL_TO_NUM[word]

    if "-" in word:
        values = []
        for part in word.split("-"):
            if part in WORD_TO_NUM:
                values.append(WORD_TO_NUM[part])
            elif part in ORDINAL_TO_NUM:
                # Keeps the ordinal tail of compounds such as "twenty-first".
                values.append(ORDINAL_TO_NUM[part])
        if values:
            return sum(values)

    return WORD_TO_NUM.get(word)
|
| |
|
| |
|
def words_to_numbers(phrase: str):
    """Turn the number words of one phrase into a list of integers.

    The phrase is lower-cased and split on whitespace; every token that
    ``normalize_token`` recognizes contributes one value, the rest are
    skipped. The collected values are then combined by count:

    - none:  ``[]``
    - two:   ``[a + b]`` when ``b >= 20``, otherwise the decimal
             concatenation ``[int("ab")]``
    - three: ``[int("ab") + c]``
    - any other count: the values unchanged

    NOTE(review): the two- and three-value rules look like pipeline-specific
    heuristics (e.g. "twenty three" yields 203); confirm they are intended.
    """
    values = []
    for word in phrase.lower().strip().split():
        number = normalize_token(word)
        if number is not None:
            values.append(number)

    count = len(values)
    if count == 0:
        return []

    if count == 3:
        first, second, third = values
        return [int(f"{first}{second}") + third]

    if count == 2:
        first, second = values
        if second >= 20:
            return [first + second]
        return [int(f"{first}{second}")]

    return values
|
| |
|
| |
|
def parse_numbers(text: str):
    """Extract the numbers spoken in ``text`` as a space-separated string.

    The text is cut into chunks at commas, periods, semicolons, exclamation
    marks and question marks; each chunk is converted with
    ``words_to_numbers`` and the results are joined in order.

    Parameters:
    - text (str): Text containing English number words.

    Returns:
    - str: The numbers found, separated by single spaces ("" when none).
    """
    # "!" and "?" end a chunk just like "."; left attached to a word they
    # would make that word unrecognizable and it would be dropped.
    chunks = re.split(r"[,\.;!?]", text)
    numbers = []
    for chunk in chunks:
        numbers.extend(words_to_numbers(chunk))
    return " ".join(str(n) for n in numbers)
|
@app.post("/transcribe")
async def transcribe_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file.

    The upload is written to a temporary ``.wav`` file, transcribed with the
    global ASR model, and the temporary file is always deleted afterwards.

    Returns a JSON object ``{"transcription": <raw ASR text>}``. The English
    translation and the numbers parsed from it are only printed, not returned.
    """
    # Read the upload before creating the temp file, so a failed read leaves
    # nothing behind on disk.
    payload = await file.read()

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    tmp_path = tmp.name
    try:
        # Close the file before handing its path to the model; the finally
        # clause below removes it even if the write itself fails.
        with tmp:
            tmp.write(payload)

        result = asr_model.transcribe([tmp_path])
        print(result)
        raw_text = result[0].text
        print(raw_text)

        translation = translate_egyptian_to_english(raw_text)

        print("\n=== English Translation ===\n")
        print(translation)

        print(parse_numbers(translation))

        return JSONResponse(content={"transcription": raw_text})

    finally:
        os.remove(tmp_path)
|
| |
|
| |
|
@app.post("/transcribe-bytes")
async def transcribe_audio_bytes(audio_bytes: bytes = File(...)):
    """Transcribe raw audio bytes and convert Arabic numbers to digits.

    The bytes are written to a temporary ``.wav`` file, transcribed with the
    global ASR model, and the temporary file is always deleted afterwards.

    Returns a JSON object ``{"transcription": <text>}`` where the text has
    been passed through ``replace_arabic_numbers``.
    """
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    tmp_path = tmp.name
    try:
        # Close the file before handing its path to the model; the finally
        # clause below removes it even if the write itself fails.
        with tmp:
            tmp.write(audio_bytes)

        result = asr_model.transcribe([tmp_path])
        raw_text = result[0].text
        cleaned_text = replace_arabic_numbers(raw_text)

        return JSONResponse(content={"transcription": cleaned_text})

    finally:
        os.remove(tmp_path)
|
| |
|
| |
|
if __name__ == "__main__":
    # Start the server when the file is run as a script.
    # reload=True is not passed: uvicorn only supports auto-reload when the
    # application is given as an import string ("module:app"), and refuses to
    # start when it receives an app object together with reload=True.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
| |
|