#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this file. If not, see <https://www.gnu.org/licenses/>.
#
#   Copyright © 2011-2025 The University of Tromsø &
#                         the Norwegian Sámi Parliament
#   http://giellatekno.uit.no & http://divvun.no
#
"""Classes and functions to sentence align two files."""

import argparse
import os
from functools import lru_cache
from pathlib import Path

from lxml import etree
from python_tca2.alignmentmodel import AlignmentModel
from python_tca2.anchorwordlist import AnchorWordList
from python_tca2.tmx import make_tmx

from corpustools import (
    argparse_version,
    corpuspath,
    generate_anchor_list,
    sentencedivider,
    util,
)

HERE = os.path.dirname(__file__)
DICTS: dict[str, str] = {}


def setup_anchors(
    lang1: str, lang2: str
) -> generate_anchor_list.GenerateAnchorList | None:
    """Set up the anchor file.

    Args:
        lang1 (str): language 1
        lang2 (str): language 2

    Returns:
        (generate_anchor_list.GenerateAnchorList): The anchor list, or None
            if no anchor file exists for the language pair.
    """
    path1 = os.path.join(
        os.environ["GTHOME"],
        f"gt/common/src/anchor-{lang1}-{lang2}.txt",
    )
    if os.path.exists(path1):
        return generate_anchor_list.GenerateAnchorList(
            lang1, lang2, [lang1, lang2], path1
        )

    path2 = os.path.join(
        os.environ["GTHOME"],
        f"gt/common/src/anchor-{lang2}-{lang1}.txt",
    )
    if os.path.exists(path2):
        return generate_anchor_list.GenerateAnchorList(
            lang1, lang2, [lang2, lang1], path2
        )

    return None


@lru_cache
def make_dict(lang1: str, lang2: str) -> str:
    """Write a seed dictionary for the language pair and return its path."""
    name = Path(f"/tmp/anchor-{lang1}-{lang2}.txt")
    gal = setup_anchors(lang1, lang2)
    if gal is not None:
        gal.generate_file(name.as_posix())
    else:
        name.write_text("fake1 / fake2\n")

    return name.as_posix()

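# For reference: the seed dictionary written above, and any file passed in via
# --dict (see parse_options below), is a plain-text file with one anchor pair
# per line and two columns separated by " / ", the source document's language
# in the first column and the parallel language in the second. The word pairs
# below are illustrative examples only, not the contents of any shipped
# anchor file:
#
#     skuvla / skole
#     giella / språk
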
def parallelise_file(
    source_lang_file: corpuspath.CorpusPath,
    para_lang_file: corpuspath.CorpusPath,
    anchor_file: str | None = None,
):
    """Align the sentences of two parallel files and write the result as TMX."""
    anchor_word_list = AnchorWordList()
    if anchor_file is not None:
        anchor_word_list.load_from_file(anchor_file)

    aligner = AlignmentModel(
        sentences_tuple=(
            sentencedivider.make_valid_sentences(source_lang_file),
            sentencedivider.make_valid_sentences(para_lang_file),
        ),
        anchor_word_list=anchor_word_list,
    )
    aligned = aligner.suggest_without_gui()
    aligned_sentences = aligned.non_empty_pairs()

    tmx_result = make_tmx(
        file1_name=source_lang_file.orig.name,
        language_pair=(source_lang_file.lang, para_lang_file.lang),
        aligned_text_pairs=aligned_sentences,
    )

    tmx_path = source_lang_file.tmx(para_lang_file.lang)
    tmx_path.write_bytes(
        etree.tostring(
            tmx_result,
            pretty_print=True,
            encoding="utf-8",
        )
    )
    print(f"TMX file created: {tmx_path}")


def is_translated_from_lang2(path: corpuspath.CorpusPath, lang2: str) -> bool:
    """Find out if the given doc is translated from lang2."""
    translated_from = path.metadata.get_variable("translated_from")

    if translated_from is None:
        return False

    return translated_from == lang2


def get_dictionary(lang1: str, lang2: str) -> str:
    """Return the seed dictionary for the language pair, creating it if needed."""
    if DICTS.get(f"{lang1}{lang2}") is None:
        DICTS[f"{lang1}{lang2}"] = make_dict(lang1, lang2)

    return DICTS[f"{lang1}{lang2}"]


def get_filepair(
    orig_path: corpuspath.CorpusPath, para_lang: str
) -> tuple[corpuspath.CorpusPath, corpuspath.CorpusPath]:
    """Return the parallel and source corpus paths for the given file."""
    if is_translated_from_lang2(orig_path, para_lang):
        para_path = orig_path
        source_path = corpuspath.make_corpus_path(
            para_path.parallel(para_lang).as_posix()
        )
    else:
        source_path = orig_path
        para_path = corpuspath.make_corpus_path(
            source_path.parallel(para_lang).as_posix()
        )

    return para_path, source_path


def parse_options():
    """Parse the commandline options.

    Returns:
        (argparse.Namespace): the parsed commandline arguments
    """
    parser = argparse.ArgumentParser(
        parents=[argparse_version.parser],
        description="Sentence align file pairs.",
    )
    parser.add_argument(
        "sources",
        nargs="+",
        help="Files or directories to search for parallelisable files",
    )
    parser.add_argument(
        "-d",
        "--dict",
        default=None,
        help="Use a different bilingual seed dictionary. "
        "Must have two columns, with the source file's language "
        "first and the --lang2 language second, separated "
        "by '/'. By default, python_tca2 files are used, but these "
        "files only support pairings between "
        "sme/sma/smj/fin/eng/nob.",
    )
    parser.add_argument(
        "-l2",
        "--lang2",
        help="Indicate which language the given file should be parallelised with",
        required=True,
    )

    args = parser.parse_args()

    return args


def main():
    """Parallelise files."""
    args = parse_options()

    for path in corpuspath.collect_files(args.sources, suffix=".xml"):
        orig_corpuspath = corpuspath.make_corpus_path(path.as_posix())
        if orig_corpuspath.lang == args.lang2:
            raise SystemExit(
                "Error: change the value of the -l2 option.\n"
                f"The -l2 value ({args.lang2}) cannot be the same as the "
                f"language of the source documents ({orig_corpuspath.lang})"
            )
        try:
            para_path, source_path = get_filepair(orig_corpuspath, args.lang2)
        except TypeError:
            continue

        try:
            parallelise_file(
                source_path,
                para_path,
                anchor_file=(
                    get_dictionary(lang1=source_path.lang, lang2=para_path.lang)
                    if args.dict is None
                    else args.dict
                ),
            )
        except (OSError, UserWarning) as error:
            print(str(error))
        except util.ArgumentError as error:
            raise SystemExit(
                f"{error}\nMore info here: "
                "https://divvun.github.io/CorpusTools/scripts/parallelize/"
                "#compile-dependencies"
            ) from error

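# Example invocation, assuming this module is exposed as the `parallelize`
# console script described at
# https://divvun.github.io/CorpusTools/scripts/parallelize/ (the script name
# is inferred from that URL; adjust it to match your installation, and replace
# the placeholder path with a real converted corpus file):
#
#     parallelize -l2 nob path/to/converted/document.xml
#
# For each given .xml file this looks up the nob parallel document, sentence
# aligns the two, and writes the aligned pairs to a TMX file.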