Source code for urdubiometer.scanner.scanner

# -*- coding: utf-8 -*-
"""
Base class for metrical scanner.

    Base class for metrical scanner that is extended to support different
    metrical paradigms.

    Notes
    -----
    Scanner uses a best-first search. Nodes are
    of type start/root ("Start"), metrical unit ("=" for long, "-" for short,
     and "_" for an optional, uncounted short followed by a wordbreak), or
    accepting ("Accepting").

"""
# add specific examples
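# For example, given the node types above, a scan string such as "=-=="
# reads: long, short, long, long; "_" marks an optional, uncounted short
# before a wordbreak. (Illustrative only; actual scan strings depend on
# the meters_list supplied to the Scanner.)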
import itertools
from collections import deque

# import logging
# logging.basicConfig(level=logging.CRITICAL)
# logger = logging.getLogger(__name__)

from .types import NodeMatch, ScanIteration, ScanResult, UnitMatch
from .validate import validate_parsers, validate_meters_list, validate_constraints
from .initialize import _meters_graph_of, _constrained_parsers_of


class Scanner:
    """
    Scanner class.

    Parameters
    ----------
    transcription_parser : graphtransliterator.GraphTransliterator
        Transcription parser.
    long_parser : graphtransliterator.GraphTransliterator
        Long metrical unit parser.
    short_parser : graphtransliterator.GraphTransliterator
        Short metrical unit parser.
    constraints : dict(str, dict(str, dict(str, list[str])))
        Nested dict of constraints, organized by previous node, next node,
        and previous production, mapping to a list of allowed next
        productions, e.g. ``{'-': {'-': {'s_bs': ['s_c']}}}``.
    meters_list : list[dict]
        A list of dictionaries of meters, each containing a meter regex
        and details.
    find_feet : function
        Method to add metrical feet to a scan.
    post_scan_filter : function
        Filter applied after a scan, used to narrow results.
    """

    def __init__(
        self,
        transcription_parser,
        long_parser,
        short_parser,
        constraints,
        meters_list,
        find_feet=None,
        post_scan_filter=None,
    ):
        validate_parsers(transcription_parser, long_parser, short_parser)
        validate_constraints(
            constraints, long_parser.productions, short_parser.productions
        )
        validate_meters_list(meters_list)

        self._transcription_parser = transcription_parser
        self._long_parser = long_parser
        self._short_parser = short_parser
        self._find_feet = find_feet
        self._post_scan_filter = post_scan_filter
        self._constraints = constraints
        self._constrained_parsers = _constrained_parsers_of(
            constraints, long_parser, short_parser
        )
        self._translation_graph = _meters_graph_of(meters_list)
        self._meters_list = meters_list
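
    # How the constraints dict is read, as a minimal sketch (the production
    # names below are illustrative assumptions, not values shipped with
    # urdubiometer):
    #
    #     constraints = {
    #         '-': {                    # previous node is a short unit ...
    #             '-': {                # ... followed by another short unit
    #                 's_bs': ['s_c'],  # after production 's_bs', only
    #             },                    # production 's_c' may match next
    #         },
    #     }
    #
    # _constrained_parsers_of() compiles these into restricted parsers that
    # scan() consults through its inner special_parser() helper.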

    def transcribe(self, input):
        """Transcribe input using transcription parser.

        Parameters
        ----------
        input : str
            Input string

        Returns
        -------
        str
            Transcription of input string
        """
        return self._transcription_parser.transliterate(input)
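
    # A hedged usage sketch (``scanner`` and the input/output below are
    # hypothetical; the actual transcription depends entirely on the
    # transcription parser's rules):
    #
    #     >>> scanner.transcribe(some_urdu_text)  # doctest: +SKIP
    #     'a transcription of the input'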

    def scan(self, input, first_only=False, graph_details=False, show_feet=False):
        """
        Scan input.

        Parameters
        ----------
        input : str
            Input string
        first_only : bool
            Return the first scan only
        graph_details : bool
            Return the graph details (list of :class:`NodeMatch`)
        show_feet : bool
            Show metrical feet in scan. Default is `False`.

        Returns
        -------
        list
            A list of :class:`ScanResult`, where each result's matches are
            :class:`UnitMatch` (or :class:`NodeMatch` if graph_details is
            True). The list is empty if no complete scans are found.
        """

        def special_parser():
            """Determine if there is a constrained parser to be used."""
            if not self._constrained_parsers or len(matches) == 0:
                return None
            parent_type = graph.node[parent_key]["type"]
            try:
                if parent_type == "_" and node_type == "=":
                    parser = self._constrained_parsers[parent_type][node_type]["*"]
                else:
                    parser = self._constrained_parsers[parent_type][node_type][
                        matches[-1].rule_found
                    ]
                return parser
            except KeyError:
                return None

        graph = self._translation_graph
        assert graph is not None

        parse = self._transcription_parser.transliterate(input)

        # Find original tokens, and add whitespace.
        transcription_tokens = (
            [[self._transcription_parser._whitespace.default]]
            + self._transcription_parser.last_matched_rule_tokens
            + [[self._transcription_parser._whitespace.default]]
        )
        # logger.debug("Parse for input %s is: %s" % (input, parse))

        tokens = self._long_parser.tokenize(parse)
        # logger.debug("Tokens for input %s are: %s" % (input, tokens))

        completed_scans = []
        stack = deque()
        for node_key in graph.edge[0]:  # <--- could add weights here
            stack.appendleft(
                ScanIteration(
                    node_key=node_key,
                    parent_key=0,
                    token_i=0,
                    matches=[],
                    matched_so_far="",
                )
            )

        continue_processing = True
        while continue_processing and len(stack) > 0:
            iteration = stack.popleft()
            (node_key, parent_key, token_i, matches, matched_so_far) = iteration
            node = graph.node[node_key]
            node_type = node["type"]
            # logger.debug(iteration)

            # ---- check if accepting ----
            if _is_accepting(node):
                if token_i == len(tokens) - 1:  # at final whitespace
                    # add feet to scan
                    if show_feet:
                        # this will raise an error if find_feet is
                        # not set for the Scanner.
                        matched_so_far = self._find_feet(matched_so_far)
                    scan_result = ScanResult(
                        scan=matched_so_far,
                        matches=matches,
                        meter_key=node.get("meter_key"),
                    )
                    completed_scans.append(scan_result)
                    # logger.debug('completed scan: %s' % str(scan_result))
                    if first_only:
                        continue_processing = False
                continue

            # ---- otherwise, check that node matches here ----
            if node_type == "=":
                parser = special_parser() or self._long_parser
            elif node_type == "-" or node_type == "_":
                parser = special_parser() or self._short_parser
            assert parser

            rules_matched = parser.match_at(token_i, tokens, match_all=True)
            if not rules_matched:
                continue

            # Tokens have been matched for this node, so process its children.
            # logger.debug(
            #     'Rules # %s of parser rule matched ' % rules_matched +
            #     'at node %s of type %s ' % (node_key, node_type)
            # )
            children = graph.edge[node_key]
            for rule_key in reversed(rules_matched):
                rule = parser.rules[rule_key]
                for child_key in children:
                    # Store data about the current match for this node,
                    # including the original tokens matched by the
                    # transcription parser.

                    # Retrieve and flatten original tokens.
                    orig_tokens = list(
                        itertools.chain.from_iterable(
                            [
                                transcription_tokens[i]
                                for i in range(token_i, token_i + len(rule.tokens))
                            ]
                        )
                    )

                    if graph_details:
                        match_data = NodeMatch(
                            type=node_type,
                            matched_tokens=rule.tokens,
                            parent_key=parent_key,
                            node_key=node_key,
                            orig_tokens=orig_tokens,
                            rule_found=rule.production,
                            # rule_key=rule_key,
                            token_i=token_i,
                        )
                    else:
                        match_data = UnitMatch(
                            type=node_type,
                            rule_found=rule.production,
                            orig_tokens=orig_tokens,
                        )

                    # add new scan iterations
                    stack.appendleft(
                        ScanIteration(
                            node_key=child_key,
                            parent_key=node_key,
                            token_i=token_i + len(rule.tokens),
                            matches=matches + [match_data],
                            matched_so_far=matched_so_far + node_type,
                        )
                    )

        if len(completed_scans) > 0 and self._post_scan_filter:
            completed_scans = self._post_scan_filter(completed_scans)

        return completed_scans
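
    # Shape of a scan result (a sketch; the variable names and values
    # below are hypothetical):
    #
    #     >>> results = scanner.scan(some_line)  # doctest: +SKIP
    #     >>> results[0].scan       # e.g. '=-==' built from node types
    #     >>> results[0].meter_key  # key of the meter found in meters_list
    #     >>> results[0].matches    # UnitMatch list (NodeMatch if graph_details)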

    @property
    def meters_list(self):
        """:obj:`list` of :obj:`dict`: Meters list."""
        return self._meters_list

    @property
    def translation_graph(self):
        """:obj:`urdubiometer.DirectedGraph`: Translation graph."""
        return self._translation_graph


# ---------- module-level helpers ----------
def _is_accepting(node):
    """Check if node is accepting."""
    return node.get("type") == "Accepting"
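

if __name__ == "__main__":
    # A minimal, hedged self-check of the helper above; these node dicts
    # are illustrative and simply mirror the "type" key used throughout
    # the translation graph.
    assert _is_accepting({"type": "Accepting"})
    assert not _is_accepting({"type": "="})
    assert not _is_accepting({})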