#!/usr/bin/python3 """Functions for handling lexc data.""" import re from typing import IO GLOBAL_EXCLUSIONS = ["CmpN/Only", "ShCmp", "Cmp/SplitR", " Rreal ", " R ", " RNoun ", " Rnoun ", " Rhyph ", "NOT-TO-LEMMATEST", "Use/Spell-", "SpellNoSugg", "\\+Pref"] def hidelexcescapes(s: str) -> str: """Encode lexc special characters differently. This function is designed to process a line or a block of lexc data including a single lexeme entry. But it'll work for any lexc snippet usually.""" s = s.replace("%!", "§EXCLAMATIONMARK§") s = s.replace("%:", "§COLON§") s = s.replace("%<", "§LESSTHAN§") s = s.replace("% ", "§SPACE§") s = s.replace("%0", "§ZERO§") if "<" in s and ">" in s: s = re.sub("<.*>", "§REGEX§", s) if "\"" in s: s = s.replace("%\"", "§QUOTATIONMARK§") if "\"" in s: # archaic translation comment s = re.sub("\".*\"", "", s) return s def unhidelexcescapes(s: str, unescape=True) -> str: """Restore encoded lexc special characters. Uses % escaping if unescape is False, otherwise characters are restored to actual surface form. """ if unescape: s = s.replace("§EXCLAMATIONMARK§", "!") s = s.replace("§COLON§", ":") s = s.replace("§LESSTHAN§", "<") s = s.replace("§SPACE§", " ") s = s.replace("§QUOTATIONMARK§", "\"") s = s.replace("§ZERO§", "0") else: s = s.replace("§EXCLAMATIONMARK§", "%!") s = s.replace("§COLON§", ":") s = s.replace("§LESSTHAN§", "<") s = s.replace("§SPACE§", " ") s = s.replace("§QUOTATIONMARK§", "%\"") s = s.replace("§ZERO§", "%0") return s def killflagdiacritics(s: str) -> str: """Remove flag diacritics from the string.""" if "@" in s: s = re.sub("@[CRDPNU].[^@]*@", "", s) return s def scrapelemmas(f: IO[str], exclusions: list[str], debug=False) -> set[str]: """Gets all lemmas from a lexc file.""" lemmas = set() for lexcline in f: if not lexcline or lexcline.strip() == "": continue excluded = False if exclusions: for exclusion in exclusions: if re.search(exclusion, lexcline): excluded = True if excluded: continue for exclusion in GLOBAL_EXCLUSIONS: if re.search(exclusion, lexcline): excluded = True if excluded: continue # preproc lexcline = hidelexcescapes(lexcline) if "!" in lexcline: lexcline = lexcline.split("!")[0] lexcline = lexcline.strip() lexcline = killflagdiacritics(lexcline) # see stuff if lexcline.startswith("LEXICON "): continue if not lexcline or lexcline == "": continue if ";" not in lexcline: continue if "+Err" in lexcline: continue if len(lexcline.split()) <= 2: continue if ":" in lexcline: analysis = unhidelexcescapes(lexcline.split(":")[0]) lemma = analysis.split("+")[0] if not lemma or lemma.strip() == "": continue if debug: print(lemma) lemmas.add(lemma) else: idstringy = unhidelexcescapes(lexcline.split()[0]) lemma = idstringy.split("+")[0] if not lemma or lemma.strip() == "": continue if debug: print(lemma) lemmas.add(lemma) return lemmas if __name__ == "__main__": pass