import heapq
import logging
import re
import time
from datetime import datetime

from tornado import gen
import tornado.iostream
import asyncio

from apertium_apy import missing_freqs_db  # noqa: F401
from apertium_apy.handlers.base import BaseHandler
from apertium_apy.keys import ApiKeys
from apertium_apy.utils import to_alpha3_code, scale_mt_log
from apertium_apy.utils.translation import parse_mode_file, make_pipeline

# Typing imports that flake8 doesn't understand:
from apertium_apy.utils.translation import FlushingPipeline, SimplePipeline  # noqa: F401
from typing import Union  # noqa: F401


class TranslationInfo:
    """Request metadata read off a handler and passed to ``scale_mt_log``."""

    def __init__(self, handler):
        self.langpair = handler.get_argument('langpair')
        self.key = handler.get_argument('key', default='null')
        # Prefer the X-Real-IP header when present, else the socket's remote address.
        self.ip = handler.request.headers.get('X-Real-IP', handler.request.remote_ip)
        self.referer = handler.request.headers.get('Referer', 'null')


class TranslateHandler(BaseHandler):
    """Handler that translates the ``q`` argument for a ``langpair``.

    Pipelines are kept per language pair in ``self.pipelines`` (a dict of
    heap-ordered lists); pipes that are due for removal are parked in
    ``self.pipelines_holding`` until they have no users left.
    """

    # Matches an asterisk-prefixed token; group 1 is the token without the mark.
    unknown_mark_re = re.compile(r'[*]([^.,;:\t\* ]+)')
    # Lazily created by get_api_key().
    api_keys = None

    def __init__(self, application, request, **kwargs):
        super().__init__(application, request, **kwargs)

    @property
    def mark_unknown(self):
        """Whether the ``markUnknown`` argument (default ``'yes'``) is truthy."""
        return self.get_argument('markUnknown', default='yes').lower() in ['yes', 'true', '1']

    def note_pair_usage(self, pair):
        """Increment the usage counter for ``pair`` in ``self.stats.usecount``."""
        self.stats.usecount[pair] = 1 + self.stats.usecount.get(pair, 0)

    def maybe_strip_marks(self, mark_unknown, pair, translated):
        """Record unknown tokens, then return ``translated`` with or without marks.

        ``pair`` is a 2-tuple of language codes. When ``mark_unknown`` is
        falsy, the ``*`` marks matched by ``unknown_mark_re`` are removed.
        """
        self.note_unknown_tokens('%s-%s' % pair, translated)
        if mark_unknown:
            return translated
        return self.unknown_mark_re.sub(r'\1', translated)

    def note_unknown_tokens(self, pair, text):
        """Report every marked token in ``text`` to ``missing_freqs_db``, if set."""
        global missing_freqs_db
        if missing_freqs_db is not None:
            for token in self.unknown_mark_re.findall(text):
                missing_freqs_db.note_unknown(token, pair)

    def cleanable(self, i, pair, pipe):
        """Return True if ``pipe`` should be taken out of the active list.

        ``i`` is the pipe's index within the pair's list. A pipe is cleanable
        when it is stuck, when it has been used more than
        ``restart_pipe_after`` times, or when it is beyond the first
        ``min_pipes_per_pair`` pipes and has been idle longer than
        ``max_idle_secs`` (an idle limit of 0 disables that check).
        """
        if pipe.stuck:
            logging.info('A pipe for pair %s-%s seems stuck, scheduling restart',
                         pair[0], pair[1])
            return True
        if pipe.use_count > self.restart_pipe_after:
            # Not affected by min_pipes_per_pair
            # Log the pipe's actual use count rather than the threshold.
            logging.info('A pipe for pair %s-%s has handled %d requests, scheduling restart',
                         pair[0], pair[1], pipe.use_count)
            return True
        elif (i >= self.min_pipes_per_pair
                and self.max_idle_secs != 0
                and time.time() - pipe.last_usage > self.max_idle_secs):
            logging.info("A pipe for pair %s-%s hasn't been used in %d secs, scheduling shutdown",
                         pair[0], pair[1], self.max_idle_secs)
            return True
        else:
            return False

    def clean_pairs(self):
        """Move cleanable pipes to the holding area and drop unused held pipes."""
        for pair in self.pipelines:
            pipes = self.pipelines[pair]
            to_clean = set(p for i, p in enumerate(pipes)
                           if self.cleanable(i, pair, p))
            self.pipelines_holding += to_clean
            # Mutate in place so self.pipelines[pair] keeps the same list object.
            pipes[:] = [p for p in pipes if p not in to_clean]
            heapq.heapify(pipes)
        # The holding area lets us restart pipes after n usages next
        # time round, since with lots of traffic an active pipe may
        # never reach 0 users
        self.pipelines_holding[:] = [p for p in self.pipelines_holding
                                     if p.users > 0]
        if self.pipelines_holding:
            logging.info('%d pipelines still scheduled for shutdown',
                         len(self.pipelines_holding))

    def get_pipe_cmds(self, l1, l2):
        """Return the parsed mode-file commands for ``l1-l2``, caching the result."""
        if (l1, l2) not in self.pipeline_cmds:
            mode_path = self.pairs['%s-%s' % (l1, l2)]
            self.pipeline_cmds[(l1, l2)] = parse_mode_file(mode_path)
        return self.pipeline_cmds[(l1, l2)]

    def should_start_pipe(self, l1, l2):
        """Return True if a new pipeline should be started for ``l1-l2``.

        That is the case when the pair has no pipes yet, or when there is
        still room below ``max_pipes_per_pair`` and the pipe at the head of
        the heap has more than ``max_users_per_pipe`` users.
        """
        pipes = self.pipelines.get((l1, l2), [])
        if not pipes:
            logging.info('%s-%s not in pipelines of this process', l1, l2)
            return True
        min_p = pipes[0]
        if len(pipes) < self.max_pipes_per_pair and min_p.users > self.max_users_per_pipe:
            logging.info('%s-%s has ≥%d users per pipe but only %d pipes',
                         l1, l2, min_p.users, len(pipes))
            return True
        return False

    def get_pipeline(self, pair):
        """Return the head-of-heap pipeline for ``pair``, starting one if needed."""
        (l1, l2) = pair
        if self.should_start_pipe(l1, l2):
            logging.info('Starting up a new pipeline for %s-%s …', l1, l2)
            if pair not in self.pipelines:
                self.pipelines[pair] = []
            p = make_pipeline(self.get_pipe_cmds(l1, l2), self.timeout)
            heapq.heappush(self.pipelines[pair], p)
        # NOTE(review): heap order comes from the pipeline objects' own
        # comparison; presumably fewest users first -- confirm in
        # apertium_apy.utils.translation.
        return self.pipelines[pair][0]

    def log_before_translation(self):
        """Return the current time, to be passed to ``log_after_translation``."""
        return datetime.now()

    def log_after_translation(self, before, length):
        """Log a finished request and, on status 200, record its timing.

        ``before`` is the value returned by ``log_before_translation`` and
        ``length`` is the length of the input text.
        """
        after = datetime.now()
        if self.scale_mt_logs:
            t_info = TranslationInfo(self)
            key = self.get_api_key(t_info.key)
            scale_mt_log(self.get_status(), after - before, t_info, key, length)

        if self.get_status() == 200:
            timings = self.stats.timing
            # Drop at most one stale entry per request. The emptiness guard
            # avoids pop(0) on an empty list when the max age is zero or
            # negative.
            if timings and datetime.now() - timings[0][0] > self.stat_period_max_age:
                timings.pop(0)
            timings.append((before, after, length))

    def get_pair_or_error(self, langpair, text_length):
        """Resolve ``langpair`` (e.g. ``eng|spa``) to an installed pair tuple.

        Sends a 400 error, logs the request and returns None when the pair is
        malformed or not installed (after ``find_fallback_mode`` has been
        consulted).
        """
        try:
            # Unpacking raises ValueError unless there are exactly two codes.
            l1, l2 = map(to_alpha3_code, langpair.split('|'))
            in_mode = '%s-%s' % (l1, l2)
        except ValueError:
            self.send_error(400, explanation='That pair is invalid, use e.g. eng|spa')
            self.log_after_translation(self.log_before_translation(), text_length)
            return None
        in_mode = self.find_fallback_mode(in_mode, self.pairs)
        if in_mode not in self.pairs:
            self.send_error(400, explanation='That pair is not installed')
            self.log_after_translation(self.log_before_translation(), text_length)
            return None
        return tuple(in_mode.split('-'))

    def get_format(self):
        """Return the ``(deformat, reformat)`` program names for this request.

        A ``format`` argument sets both. Otherwise ``deformat`` (default
        ``html``) and ``reformat`` (default ``html-noent``) are read
        separately, and the ``apertium-des`` / ``apertium-re`` prefix is added
        when the value does not already contain it.
        """
        dereformat = self.get_argument('format', default=None)
        if dereformat:
            deformat = 'apertium-des' + dereformat
            reformat = 'apertium-re' + dereformat
        else:
            deformat = self.get_argument('deformat', default='html')
            if 'apertium-des' not in deformat:
                deformat = 'apertium-des' + deformat
            reformat = self.get_argument('reformat', default='html-noent')
            if 'apertium-re' not in reformat:
                reformat = 'apertium-re' + reformat
        return deformat, reformat

    @gen.coroutine
    def translate_and_respond(self, pair, pipeline, to_translate, mark_unknown,
                              nosplit=False, deformat=True, reformat=True, prefs=''):
        """Translate ``to_translate`` through ``pipeline`` and send the response.

        ``mark_unknown`` is the raw request value; it counts as true when it
        is ``yes``, ``true`` or ``1`` in any letter case. On a timeout or a
        closed stream the pipeline is flagged as stuck and a 503 is sent.
        ``clean_pairs`` runs afterwards in either case.
        """
        # Compare case-insensitively, matching the mark_unknown property.
        # Non-string values are left untouched and evaluate as before.
        if isinstance(mark_unknown, str):
            mark_unknown = mark_unknown.lower()
        mark_unknown = mark_unknown in ['yes', 'true', '1']
        self.note_pair_usage(pair)
        before = self.log_before_translation()
        try:
            translated = yield pipeline.translate(to_translate, nosplit, deformat, reformat, prefs)
            self.log_after_translation(before, len(to_translate))
            self.send_response({
                'responseData': {
                    'translatedText': self.maybe_strip_marks(mark_unknown, pair, translated),
                },
                'responseDetails': None,
                'responseStatus': 200,
            })
        except (asyncio.TimeoutError, tornado.iostream.StreamClosedError) as e:
            logging.warning('Translation error in pair %s-%s: %s', pair[0], pair[1], e)
            # Flag the pipe so cleanable() schedules it for restart.
            pipeline.stuck = True
            self.send_error(503, explanation='internal error')
        self.clean_pairs()

    @gen.coroutine
    def get(self):
        """Handle GET: translate ``q`` for ``langpair`` and send the result."""
        pair = self.get_pair_or_error(self.get_argument('langpair'),
                                      len(self.get_argument('q')))
        if pair is not None:
            pipeline = self.get_pipeline(pair)  # type: Union[FlushingPipeline, SimplePipeline]
            deformat, reformat = self.get_format()
            yield self.translate_and_respond(pair,
                                             pipeline,
                                             self.get_argument('q'),
                                             self.get_argument('markUnknown', default='yes'),
                                             nosplit=False,
                                             deformat=deformat,
                                             reformat=reformat,
                                             prefs=self.get_argument('prefs', default=''),
                                             )

    @classmethod
    def get_api_key(cls, key):
        """Look up ``key``, creating the shared ``ApiKeys`` store on first use."""
        if not cls.api_keys:
            cls.api_keys = ApiKeys(cls.api_keys_conf)
        return cls.api_keys.get_key(key)


class PairPrefsHandler(BaseHandler):
    """Handler that responds with ``self.pairprefs``."""

    @gen.coroutine
    def get(self):
        self.send_response(self.pairprefs)