# -*- coding:utf-8 -*- # Copyright © 2020-2024 UiT The Arctic University of Norway # License: GPL3 # noqa: ERA001 # Author: Børre Gaup from pathlib import Path from typing import Any, Iterable from corpustools import ccat # type: ignore from lxml.etree import _Element, _ElementTree, parse from giellaltgramtools.common import COLORS from giellaltgramtools.corpus_gramchecker import CorpusGramChecker from giellaltgramtools.gramtest import GramTest from giellaltgramtools.normaloutput import NormalOutput from giellaltgramtools.testdata import TestData class CorpusGramTest(GramTest): def __init__(self, args: dict[str, str], ignore_typos: bool, targets: list[str]): super().__init__() self.targets = targets self.config: dict[str, Any] = { "out": NormalOutput(args), "ignore_typos": ignore_typos, "spec": Path(args.get("spec", "")), "variants": [args.get("variant")] if args.get("variant") else [], } if not args.get("colour"): for key in list(COLORS.keys()): COLORS[key] = "" def flatten_para(self, para: _Element) -> None: """Convert non-error xml elements into plain text.""" if not (para.tag.startswith("error") or para.tag == "correct"): text = para.text if para.text else "" if para.tail: text += para.tail parent = para.getparent() if parent is not None: parent.remove(para) if parent.text: parent.text = parent.text + text else: parent.text = text for child in para: self.flatten_para(child) def _add_text_to_previous(self, url: _Element, previous: _Element) -> None: """Add URL text and tail to previous element.""" if url.text is not None: if previous.tail is not None: previous.tail += url.text else: previous.tail = url.text if url.tail is not None: if previous.tail is not None: previous.tail += url.tail else: previous.tail = url.tail def _add_text_to_parent(self, url: _Element, parent: _Element) -> None: """Add URL text and tail to parent element.""" if url.text is not None: if parent.text is not None: parent.text += url.text else: parent.text = url.text if url.tail is not None: if parent.text is not None: parent.text += url.tail else: parent.text = url.tail def keep_url(self, root: _ElementTree) -> None: """Keep url as plain text.""" for url in root.xpath('.//errorlang[@correct="url"]'): parent = url.getparent() previous = url.getprevious() if previous is not None: self._add_text_to_previous(url, previous) else: self._add_text_to_parent(url, parent) parent.remove(url) def get_error_data(self, filename: str) -> Iterable[_Element]: root: _ElementTree = parse(filename) self.keep_url(root) for para in root.iter("p"): # the xml:lang attribute indicates that the sentence is not the expected # language. These paragraphs are not included in the test. if not para.get("{http://www.w3.org/XML/1998/namespace}lang"): self.flatten_para(para) yield para def make_test_results(self) -> Iterable[TestData]: grammarchecker = CorpusGramChecker(self.config) return ( test_data for filename in ccat.find_files(self.targets, ".xml") for test_data in grammarchecker.make_test_results( self.get_error_data(filename), filename=filename ) )