esl-reader/plugins/merriam-webster.py

from importlib.abc import InspectLoader
from PyQt6.QtGui import QColor, QFont
from trycast import trycast
import json
import re
from typing import Any, Literal, NotRequired, TypedDict, cast

from PyQt6.QtCore import QEventLoop, QUrl, Qt
from PyQt6.QtNetwork import QNetworkRequest
from lib.utils import Resources
from lib.definition import Line, Fragment

# Identification record for this plugin: the short source tag and the
# human-readable dictionary name.
# NOTE(review): presumably read by the host's plugin loader - confirm.
registration = dict(
    source='mw',
    name='Merriam-Webster',
)

# URL template for a lookup; ``fetch`` fills in ``{word}`` and ``{key}``.
API = (
    "https://www.dictionaryapi.com/api/v3/references/collegiate/json/"
    "{word}?key={key}"
)
# NOTE(review): the API key is committed in source; consider loading it from
# configuration instead.
key = "51d9df34-ee13-489e-8656-478c215e846c"

class Meta(TypedDict):
    """Typed shape of an entry's ``meta`` object (``Definition['meta']``).

    Within this file only ``id`` is read: ``getDef`` lower-cases it and keeps
    the part before the first ``:`` to decide which entries belong to the
    same headword.  The remaining fields are declared so ``trycast`` can
    validate the payload.
    """
    id: str
    uuid: str
    sort: str
    src: str
    section: str
    stems: list[str]
    offensive: bool

class Sound(TypedDict):
    """Typed shape of the ``sound`` member of a pronunciation.

    ``soundUrl`` turns ``audio`` into a media URL; ``ref`` and ``stat`` are
    not used anywhere in this file.
    """
    # Base file name of the recording (no directory, no extension).
    audio: str
    ref: str
    stat: str

class Pronunciation(TypedDict):
    """Typed shape of one item of a ``prs`` list.

    ``do_prs`` renders these: ``mw`` is the text shown, ``l`` is placed
    before it and ``l2`` after it, ``pun`` is the separator used between
    them (a single space when absent), and ``sound`` makes the fragment
    playable.
    """
    mw: str
    l: NotRequired[str]
    l2: NotRequired[str]
    pun: NotRequired[str]
    sound: NotRequired[Sound]

class SubSource(TypedDict):
    """Typed shape of ``AttributionOfQuote['subsource']``.

    Not read by any code in this file; declared for validation only.
    """
    source: NotRequired[str]
    aqdate: NotRequired[str]

class AttributionOfQuote(TypedDict):
    """Typed shape of the optional ``aq`` member of a verbal illustration.

    ``do_vis`` validates it with ``trycast`` and hands it to ``do_aq``, which
    is currently a stub that renders nothing.
    NOTE(review): field meanings (author, source, date) are presumed from the
    names - confirm against the Merriam-Webster API documentation.
    """
    auth: NotRequired[str]
    source: NotRequired[str]
    aqdate: NotRequired[str]
    subsource: NotRequired[SubSource]

class VerbalIllustration(TypedDict):
    """Typed shape of one item of a ``vis`` list.

    ``do_vis`` renders ``t`` as a text fragment and forwards ``aq`` (when
    present) to ``do_aq``.
    """
    t: str
    aq: NotRequired[AttributionOfQuote]

class HeadWordInformation(TypedDict):
    """Typed shape of an entry's ``hwi`` object.

    ``getDef`` shows ``hw`` twice: with every ``*`` removed as the header, and
    (only if it contains a ``*``) with ``*`` replaced by U+00B7 on the
    pronunciation line.  ``prs`` is rendered by ``do_prs``.
    """
    hw: str
    prs: NotRequired[list[Pronunciation]]

class AlternanteHeadword(TypedDict):
    """Typed shape of one item of an entry's ``ahws`` list.

    ``getDef`` appends each ``hw`` (with ``*`` removed) to the header line
    after a comma; ``psl`` is not used in this file.
    NOTE(review): the class name is misspelled ("Alternante"); it is kept
    because other modules may import it.
    """
    hw: str
    psl: NotRequired[str]

class Variant(TypedDict):
    """Typed shape of one item of a ``vrs`` list.

    Declared for validation; no code in this file reads its fields.
    """
    va: str
    vl: NotRequired[str]
    prs: NotRequired[list[Pronunciation]]
    spl: NotRequired[str]

# Typed shape of one item of an ``ins`` list.  The functional TypedDict form
# is required because the key ``if`` is a Python keyword and cannot be
# written as a class attribute.  No code in this file reads these fields.
Inflection =TypedDict('Inflection', {
    'if': NotRequired[str],
    'ifc': NotRequired[str],
    'il': NotRequired[str],
    'prs': NotRequired[list[Pronunciation]],
    'spl': NotRequired[str]
    })

class CrossReferenceTarget(TypedDict):
    """Typed shape of one item of ``CognateCrossRef['cxtis']``.

    Declared for validation; no code in this file reads its fields.
    """
    cxl: str
    cxr: NotRequired[str]
    cxt: str
    cxn: NotRequired[str]

class CognateCrossRef(TypedDict):
    """Typed shape of one item of an entry's ``cxs`` list.

    Declared for validation; no code in this file reads its fields.
    """
    cxl: str
    cxtis: list[CrossReferenceTarget]

class Pair(TypedDict):
    """A tagged value produced by ``make_pairs``.

    ``objType`` holds the element tag (e.g. ``'text'``, ``'vis'``,
    ``'sense'``) and ``obj`` the payload that followed it in the flat list.
    """
    objType: str
    obj: Any

class DividedSense(TypedDict):
    """Typed shape of ``Sense['sdsense']``.

    ``do_sense`` renders ``sd`` in italics followed by the fragments produced
    from ``dt``; the other optional members are not rendered yet.
    """
    sd: str
    dt: list[list[Pair]]
    et: NotRequired[list[Pair]]
    ins: NotRequired[list[Inflection]]
    lbs: NotRequired[list[str]]
    prs: NotRequired[list[Pronunciation]]
    sgram: NotRequired[str]
    sls: NotRequired[list[str]]
    vrs: NotRequired[list[Variant]]

class Sense(TypedDict):
    """Typed shape of a ``sense`` payload after ``restructure``.

    ``do_sense`` renders ``dt``, ``sdsense`` and ``sls``, ignores ``sn``, and
    raises ``NotImplementedError`` for any other key that is present.
    """
    dt: list[list[Pair]]
    et: NotRequired[list[Pair]]
    ins: NotRequired[list[Inflection]]
    lbs: NotRequired[list[str]]
    prs: NotRequired[list[Pronunciation]]
    sdsense: NotRequired[DividedSense]
    sgram: NotRequired[str]
    sls: NotRequired[list[str]]
    sn: NotRequired[str]
    vrs: NotRequired[list[Variant]]

# Alias-style subclass with the same fields as ``Sense``; not referenced by
# any other code in this file.
class TruncatedSense(Sense): pass

class BindingSubstitutePair(TypedDict):
    """A ``Pair`` whose tag is fixed to ``'bs'``.

    NOTE(review): ``do_pseq`` and ``do_sseq`` read a 'bs' payload as
    ``pair['obj']['sense']``, i.e. a mapping that wraps the sense, which does
    not match ``obj: Sense`` declared here - confirm the intended shape.
    """
    objType: Literal['bs']
    obj: Sense

class SensePair(TypedDict):
    """A ``Pair`` whose tag is fixed to ``'sense'`` and whose payload is a ``Sense``."""
    objType: Literal['sense']
    obj: Sense

class DefinitionSection(TypedDict):
    """Typed shape of one item of an entry's ``def`` list.

    ``do_def`` renders ``vd`` (when present) on its own line and passes
    ``sseq`` to ``do_sseq``; ``sls`` is not rendered at this level.
    """
    vd: NotRequired[str]
    sls: NotRequired[list[str]]
    # Left as Any; do_sseq treats it as list[list[list[Pair]]].
    sseq: Any # list[list[Pair]]

# Typed shape of one top-level dictionary entry, as validated by
# ``trycast(list[Definition], ...)`` in ``getDef``.  The functional TypedDict
# form is required because the key ``def`` is a Python keyword.
#
# ``fl`` is optional: ``getDef`` already guards every access to it
# (``'fl' in work``), and declaring it required made validation reject the
# whole result list whenever a single entry had no functional label.
Definition =TypedDict('Definition', {
    'meta': Meta,
    'hom': NotRequired[int],
    'hwi': HeadWordInformation,
    'ahws': NotRequired[list[AlternanteHeadword]],
    'vrs': NotRequired[list[Variant]],
    'fl': NotRequired[str],
    'lbs': NotRequired[list[str]],
    'sls': NotRequired[list[str]],
    'ins': NotRequired[list[Inflection]],
    'cxs': NotRequired[list[CognateCrossRef]],
    'def': list[DefinitionSection],
})

def make_pairs(src: list[Any]) -> list[Pair]:
    """Group a flat ``[tag, value, tag, value, ...]`` list into ``Pair`` dicts.

    Items are consumed two at a time; a trailing unpaired item is dropped.
    A value that is a list is accepted without validation; any other value
    is checked with ``trycast`` and an ``AssertionError`` is raised (after
    printing the offending data) when it does not form a valid ``Pair``.
    """
    pairs: list[Pair] = []
    # Drawing twice from one iterator per step yields consecutive items.
    stream = iter(src)
    for tag, payload in zip(stream, stream):
        candidate = {'objType': tag, 'obj': payload}
        if isinstance(payload, list):
            pairs.append(cast(Pair, candidate))
            continue
        checked = trycast(Pair, candidate)
        if checked is None:
            print(tag, type(payload), json.dumps(payload, indent=2))
        assert checked is not None
        pairs.append(checked)
    return pairs

# Tags that mark a list as a flat ``[tag, value, ...]`` sequence.
Elements = [
    'dt', 'sen', 'bs', 'pseq', 'snot', 't', 'text', 'vis', 'sens', 'uns', 'sense',
]
def restructure(obj: Any) -> Any:
    """Return a rebuilt copy of ``obj`` with tagged lists turned into ``Pair``s.

    Dicts and ordinary lists are rebuilt recursively; scalars are returned
    unchanged.  A non-empty list whose first item is a string found in
    ``Elements`` is handed to ``make_pairs``; each resulting pair's list or
    dict payload then has its members restructured in turn.
    """
    if isinstance(obj, dict):
        return {name: restructure(value) for name, value in obj.items()}
    if not isinstance(obj, list):
        return obj
    if not obj:
        return []

    head = obj[0]
    if not (isinstance(head, str) and head in Elements):
        return [restructure(member) for member in obj]

    tagged = make_pairs(obj)
    for pair in tagged:
        payload = pair['obj']
        # The payload's members are processed individually; the payload list
        # itself is never re-examined for a leading tag.
        if isinstance(payload, list):
            pair['obj'] = [restructure(member) for member in payload]
        elif isinstance(payload, dict):
            pair['obj'] = {
                name: restructure(value) for name, value in payload.items()
            }
    return tagged

class WordType(TypedDict):
    """Result record returned by ``fetch``.

    ``word`` is the term that was looked up, ``source`` is the plugin tag
    (``'mw'``), and ``definition`` is the decoded JSON reply.
    """
    word: str
    source: str
    definition: Any

def fetch(word:str) ->  WordType:
    """Look ``word`` up through the dictionary API and return the decoded reply.

    The request is issued on the shared ``Resources.nam`` network manager and
    a local event loop is run until the reply finishes (3 s transfer
    timeout), so this call blocks the caller.

    Returns:
        A ``WordType`` holding the word, the source tag ``'mw'`` and the
        decoded JSON body.

    Raises:
        json.JSONDecodeError: if the body is not valid JSON (for instance an
            empty body after a network failure or timeout).
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    request = QNetworkRequest()
    # Percent-encode the word so characters such as '/', '?' or '#' cannot
    # alter the path or query of the request URL.
    request.setUrl(QUrl(API.format(word=quote(word, safe=''), key=key)))
    request.setTransferTimeout(3000)
    reply = Resources.nam.get(request)
    assert reply is not None
    loop = QEventLoop()
    reply.finished.connect(loop.quit)
    loop.exec()
    try:
        content = reply.readAll()
    finally:
        # The caller owns the reply object; schedule its deletion so each
        # lookup does not leak one.
        reply.deleteLater()
    data = json.loads(content.data().decode('utf-8'))
    return {
        'word': word,
        'source': 'mw',
        'definition': data,
    }

def soundUrl(sound:Sound, fmt='ogg') -> QUrl:
    """Build the media URL for a pronunciation's ``sound`` object.

    The sub-directory is ``bix`` or ``gg`` when the audio name starts with
    that prefix, otherwise the first character when it is an ASCII letter,
    and ``number`` for anything else.  ``fmt`` is used both as a path
    component and as the file extension.
    """
    audio = sound['audio']
    prefix = re.match(r"(bix|gg|[a-zA-Z])", audio)
    subdir = prefix.group(1) if prefix else "number"
    return QUrl(
        f"https://media.merriam-webster.com/audio/prons/en/us/{fmt}"
        f"/{subdir}/{audio}.{fmt}"
    )

def getFirstSound(definition: Any) -> QUrl:
    """Return the URL of the first playable pronunciation in ``definition``.

    Every entry's top-level members are scanned in order; the first member
    that is a mapping with a ``prs`` list containing a ``sound`` yielding a
    valid URL wins.  An empty ``QUrl`` is returned when nothing is found.

    Non-mapping entries and members (strings, numbers, lists) are skipped.
    Previously ``'prs' in value`` was evaluated on every member, which raised
    ``TypeError`` for integer members such as ``hom``.
    """
    # Members that may carry 'prs' in the API:
    #  ahws, cats, dros, hwi, ins, ri, sdsense, sen, sense, uros, vrs
    # Only mapping-valued members are examined here; list-valued ones (e.g.
    # vrs, ins) are not descended into, as before.
    for entry in definition:
        if not isinstance(entry, dict):
            continue
        for member in entry.values():
            if not isinstance(member, dict):
                continue
            for pr in member.get('prs', ()):
                if 'sound' not in pr:
                    continue
                url = soundUrl(pr['sound'])
                if url.isValid():
                    return url
    return QUrl()

def do_prs(hwi: Any) -> list[Fragment]:
    """Turn the ``prs`` list of ``hwi`` into display fragments.

    For each pronunciation, the optional ``l`` label comes first (followed by
    the separator), then the ``mw`` text, then the optional ``l2`` label
    (preceded by the separator).  The separator is ``pun`` or a single space.
    The ``mw`` fragment gets its audio URL and the link colour when a
    ``sound`` is present; everything else uses the subdued colour.
    Returns an empty list when ``hwi`` has no ``prs``.
    """
    res = Resources()
    label_font = res.labelFont
    link = res.linkColor
    subdued = res.subduedColor

    if 'prs' not in hwi:
        return []

    out: list[Fragment] = []
    for pr in hwi['prs']:
        sep = pr['pun'] if 'pun' in pr else ' '
        if 'l' in pr:
            out.append(Fragment(pr['l'] + sep, label_font, color=subdued))
        spoken = Fragment(pr['mw'], label_font, color=subdued)
        if 'sound' in pr:
            spoken.setAudio(soundUrl(pr['sound']))
            spoken.setColor(link)
        out.append(spoken)
        if 'l2' in pr:
            out.append(Fragment(sep + pr['l2'], label_font, color=subdued))
    return out

def do_aq(aq: AttributionOfQuote|None) -> list[Line]:
    """Render a quote attribution.

    Currently a stub: it only checks that ``aq`` was validated successfully
    (raising ``AssertionError`` otherwise) and produces no lines.
    """
    assert aq is not None
    rendered: list[Line] = []
    return rendered

def do_vis(vis: list[VerbalIllustration]|None,indent=0) -> list[Line]:
    """Render verbal illustrations, one ``Line`` per illustration.

    Args:
        vis: validated illustrations; ``None`` (failed validation) is
            rejected with ``AssertionError``.
        indent: indent applied to each fragment when greater than zero.

    Returns:
        The rendered lines, each followed by whatever ``do_aq`` produces for
        its attribution.
    """
    assert vis is not None
    r = Resources()
    lines: list[Line] = []
    for vi in vis:
        frag = Fragment(vi['t'], r.textFont, color=r.baseColor)
        if indent > 0:
            frag.setIndent(indent)
        line = Line()
        line.addFragment(frag)
        lines.append(line)
        if 'aq' in vi:
            lines += do_aq(trycast(AttributionOfQuote, vi['aq']))
    # Return what was built; the previous ``return []`` threw every
    # illustration away.
    return lines

def do_uns(uns: list[list[list[Pair]]]|None, indent:int) -> list[Line]:
    """Render usage notes.

    Each ``text`` pair becomes its own indented line prefixed with a space
    and an em dash; each ``vis`` pair is delegated to ``do_vis`` with the
    same indent.  An ``ri`` pair raises ``NotImplementedError``; any other
    tag is ignored.  ``uns`` must not be ``None``.
    """
    assert uns is not None
    res = Resources()
    rendered: list[Line] = []
    every_pair = (pair for note in uns for entry in note for pair in entry)
    for pair in every_pair:
        kind = pair['objType']
        if kind == 'text':
            note_frag = Fragment(' \u2014'+pair['obj'], res.textFont, color=res.baseColor)
            note_frag.setIndent(indent)
            row = Line()
            row.addFragment(note_frag)
            rendered.append(row)
        elif kind == 'vis':
            examples = trycast(list[VerbalIllustration], pair['obj'])
            rendered.extend(do_vis(examples, indent))
        elif kind == 'ri':
            raise NotImplementedError("NO ri")
    return rendered

def do_dt(dt: list[list[Pair]]|None, indent: int) -> tuple[list[Fragment], list[Line]]:
    """Render defining text into inline fragments plus follow-on lines.

    ``text`` pairs of the first entry are returned as fragments (to be placed
    on the caller's current line); ``text`` pairs of later entries each get a
    line of their own.  ``vis`` and ``uns`` pairs contribute lines through
    ``do_vis`` and ``do_uns``.  Any other tag is dumped to stdout and raises
    ``NotImplementedError``.  ``dt`` must not be ``None``.
    """
    assert dt is not None
    res = Resources()
    inline: list[Fragment] = []
    below: list[Line] = []
    for position, entry in enumerate(dt):
        for pair in entry:
            kind = pair['objType']
            if kind == 'text':
                piece = Fragment(pair['obj'], res.textFont, color=res.baseColor)
                piece.setIndent(indent)
                if position == 0:
                    inline.append(piece)
                else:
                    row = Line()
                    row.addFragment(piece)
                    below.append(row)
            elif kind == 'vis':
                # NOTE(review): unlike do_uns, no indent is forwarded here -
                # confirm whether that is intentional.
                below += do_vis(trycast(list[VerbalIllustration], pair['obj']))
            elif kind == 'uns':
                below += do_uns(trycast(list[list[list[Pair]]], pair['obj']),indent)
            else:
                print(json.dumps(pair, indent=2))
                raise NotImplementedError(f"Unknown or unimplimented element {pair['objType']}")
    return (inline, below)

def do_sense(sense: Sense|None, indent:int=3) -> tuple[list[Fragment], list[Line]]:
    """Render one sense into inline fragments plus follow-on lines.

    Args:
        sense: validated sense; ``None`` yields two empty lists.
        indent: indent applied to the text produced for this sense.

    Returns:
        ``(frags, lines)``: fragments for the caller's current line (defining
        text, then any ``sls`` labels) and the lines that follow it
        (illustrations, usage notes, divided sense).

    Raises:
        NotImplementedError: for any key other than ``dt``, ``sn``,
            ``sdsense`` and ``sls``.
        AssertionError: when ``sdsense`` or ``sls`` fails validation.
    """
    if sense is None:
        return ([],[])
    lines: list[Line] = []
    frags: list[Fragment] = []
    r = Resources()
    (newFrags, newLines) = do_dt(trycast(list[list[Pair]], sense['dt']),indent)
    frags += newFrags
    lines += newLines
    for k,v in sense.items():
        if k == 'dt' or k == 'sn':
            continue
        elif k == 'sdsense':
            # XXX - This needs to expand to handle et, ins, lbs, prs, sgram, sls, vrs
            sdsense = trycast(DividedSense, v)
            assert sdsense is not None
            # (A leftover debug print of the italic font description was
            # removed here; it wrote to stdout for every divided sense.)
            frag = Fragment(sdsense['sd']+' ', r.italicFont, color=r.baseColor)
            frag.setIndent(indent)
            line = Line()
            line.addFragment(frag)
            (newFrags, newLines) = do_dt(trycast(list[list[Pair]], sdsense['dt']), indent=indent)
            line.addFragment(newFrags)
            lines.append(line)
            lines += newLines
        elif k == 'sls':
            labels = trycast(list[str], v)
            assert labels is not None
            frag = Fragment(", ".join(labels)+' ', r.textFont, color=r.subduedColor)
            frags.append(frag)
        else:
            # Dump the unsupported member so the missing case can be added.
            print(k,v)
            raise NotImplementedError(f"Unknown or unimplimented element {k}")
    return (frags, lines)

def do_pseq(outer: int,
            inner: int,
            pseq: list[Any] ) -> tuple[list[Fragment], list[Line]]:
    """Render a parenthesised sense sequence.

    ``sense`` pairs are numbered ``(1)``, ``(2)``, ... at indent 3 and
    rendered with indent 4; ``bs`` pairs are rendered unnumbered from their
    wrapped ``sense``.  The first pair's fragments are returned for the
    caller's current line, later pairs each start a line of their own.
    ``outer`` and ``inner`` are accepted but not used.  Any other tag raises
    ``NotImplementedError``.
    """
    res = Resources()
    head: list[Fragment] = []
    rest: list[Line] = []
    number = 1
    pending_first = True
    for pair in (p for entry in pseq for p in entry):
        kind = pair['objType']
        if kind == 'bs':
            sense_frags, sense_lines = do_sense(trycast(Sense, pair['obj']['sense']))
            if pending_first:
                head += sense_frags
                pending_first = False
            else:
                row = Line()
                row.addFragment(sense_frags)
                rest.append(row)
            rest += sense_lines
        elif kind == 'sense':
            marker = Fragment(f"({number})", res.textFont, color=res.baseColor)
            marker.setIndent(3)
            sense_frags, sense_lines = do_sense(trycast(Sense, pair['obj']), indent=4)
            if pending_first:
                head.append(marker)
                head += sense_frags
                pending_first = False
            else:
                row = Line()
                row.addFragment(marker)
                row.addFragment(sense_frags)
                rest.append(row)
            rest += sense_lines
            number += 1
        else:
            raise NotImplementedError(f"Unknown object type {pair['objType']}")
    return (head, rest)

def do_sseq(sseq:list[list[list[Pair]]]) -> list[Line]:
    """Render a sense sequence into display lines.

    Each outer item is labelled with a bold number (indent 1) and each inner
    item with a bold letter (indent 2).  Every ``sense``, ``pseq`` or ``bs``
    pair completes the current line with its fragments, appends that line and
    then appends the pair's follow-on lines.

    Raises:
        NotImplementedError: for ``sen`` pairs and for unknown tags.
    """
    lines: list[Line] = []
    r = Resources()
    for outer, item_o in enumerate(sseq):
        line = Line()
        frag =Fragment(str(outer+1), r.boldFont, color=r.baseColor)
        frag.setIndent(1)
        line.addFragment(frag)
        for inner, item_i in enumerate(item_o):
            frag =Fragment(chr(ord('a')+inner), r.boldFont, color=r.baseColor)
            frag.setIndent(2)
            line.addFragment(frag)
            for pair in item_i:
                objType = pair['objType']
                if objType == 'sense':
                    (frags, newlines) = do_sense(trycast(Sense, pair['obj']))
                elif objType == 'pseq':
                    # Argument order follows do_pseq(outer, inner, pseq);
                    # the two indices used to be passed swapped.
                    (frags, newlines) = do_pseq(outer, inner, pair['obj'])
                elif objType == 'bs':
                    # The sense is wrapped one level deeper in a 'bs' payload.
                    (frags, newlines) = do_sense(trycast(Sense, pair['obj']['sense']))
                elif objType == 'sen':
                    raise NotImplementedError("sen unimplimented")
                else:
                    raise NotImplementedError(f"Unknown object[{objType}] for \n{json.dumps(pair['obj'],indent=2)}")
                # Common tail: finish the current line, start a fresh one.
                line.addFragment(frags)
                lines.append(line)
                line = Line()
                lines += newlines
    return lines

def do_def(entry: DefinitionSection) -> list[Line]:
    """Render one definition section.

    When ``vd`` is present it is shown first on a line of its own (italic
    font, link colour); the lines produced by ``do_sseq`` for the mandatory
    ``sseq`` member follow.
    """
    assert entry is not None
    res = Resources()
    rendered: list[Line] = []
    if 'vd' in entry:
        divider = Line()
        divider.addFragment(Fragment(entry['vd'], res.italicFont, color = res.linkColor))
        rendered.append(divider)
    # sseq is required, so it is read without a membership test.
    rendered.extend(do_sseq(entry['sseq']))
    return rendered

def getDef(defines: Any) -> list[Line]:
    """Render a decoded API reply into display lines.

    Only entries whose ``meta.id`` (lower-cased, up to the first ``:``)
    matches that of the first entry are shown.  For each of them a header
    line (headword, alternate headwords, "N of M" badge, functional label),
    an optional pronunciation line and the definition sections are produced.
    Sections that hit an unsupported element are skipped after printing the
    reason.

    Args:
        defines: the raw decoded JSON list returned by the API.

    Returns:
        The rendered lines; an empty list when the reply holds no entries.

    Raises:
        AssertionError: when the reply, a headword block or an
            alternate-headword list does not match the expected schema.
    """
    Line.setParseText(parseText)
    workList = trycast(list[Definition], restructure(defines))
    assert workList is not None
    # An empty reply used to fail with IndexError on workList[0].
    if not workList:
        return []
    r = Resources()
    lines:list[Line] = []

    def base_id(entry: Definition) -> str:
        # 'word:2' and 'Word' both map to 'word'.
        return entry['meta']['id'].lower().split(':')[0]

    #
    # No need to figure it out each time it is used
    #
    primary_id = base_id(workList[0])
    primary = [entry for entry in workList if base_id(entry) == primary_id]
    entries = len(primary)

    #
    # Count how often each functional label occurs so repeated labels can be
    # numbered, e.g. "noun (1)", "noun (2)".
    #
    uses: dict[str,int] = {}
    for entry in primary:
        if 'fl' in entry:
            uses[entry['fl']] = uses.get(entry['fl'], 0) + 1
    used: dict[str, int] = dict.fromkeys(uses, 0)

    # Number only the entries actually shown.  The index over the unfiltered
    # list could exceed the total when a skipped entry preceded a shown one.
    for position, work in enumerate(primary, start=1):
        #
        # Create the First line from the hwi, [ahws] and fl
        #
        line = Line()
        hwi = trycast(HeadWordInformation, work['hwi'])
        assert hwi is not None
        line.addFragment(
            Fragment(hwi['hw'].replace('*', ''), r.headerFont, color=r.baseColor)
        )
        if 'ahws' in work:
            ahws = trycast(list[AlternanteHeadword], work['ahws'])
            assert ahws is not None
            for ahw in ahws:
                alt = ahw['hw'].replace('*', '')
                line.addFragment(Fragment(', ' + alt, r.headerFont, color=r.baseColor))
        if entries > 1:
            frag = Fragment(f" {position} of {entries} ", r.textFont, color= r.subduedColor)
            # XXX - Use a resource color!!!
            frag.setBackground(QColor(Qt.GlobalColor.gray))
            line.addFragment(frag)
        if 'fl' in work:
            text = work['fl']
            used[text] += 1
            if uses[text] > 1:
                text += f' ({used[text]})'
            line.addFragment(Fragment(text, r.labelFont, color=r.baseColor))
        lines.append(line)

        #
        # Next is the pronunciation.
        # While 'prs' is optional, the headword is not.  This gets us what we want.
        #
        line = Line()
        if '*' in hwi['hw']:
            # Show the syllable breaks as middle dots.
            dotted = hwi['hw'].replace('*', '\u00b7')
            line.addFragment(Fragment(dotted + ' ', r.textFont, color=r.subduedColor))
        for frag in do_prs(hwi):
            line.addFragment(frag)
        if len(line.getLine()) > 0:
            lines.append(line)

        sections = trycast(list[DefinitionSection], work['def'])
        assert sections is not None
        for section in sections:
            try:
                lines += do_def(section)
            except NotImplementedError as e:
                # Best effort: report the unsupported element, keep the rest.
                print(e)
    return lines

def parseText(frag: Fragment) -> list[Fragment]:
    """Split a fragment containing ``{...}`` markup into styled fragments.

    ``frag`` is consumed in place: its text is shortened as tokens are
    processed and its font is changed while the contents of a block token
    are parsed.  Plain text before a token is copied into a new fragment.

    Three kinds of token are handled:

    * ``{bc}`` - emitted as a bold ``": "`` fragment.
    * block tokens (``{b}``, ``{it}``, ``{gloss}``, ...) - the function calls
      itself on the remainder, which returns as soon as it meets a ``{/``
      end token; the last returned fragment (starting with that end token)
      is popped, checked against the expected ``{/token}``, and parsing
      continues after it.
    * self-contained tokens with ``|``-separated fields (``{a_link|..}``,
      ``{sx|..|..|..}``, ...) - emitted as a link fragment carrying a word
      reference and a target kind.

    Returns:
        The fragments in display order.  When parsing stops at an end token,
        the last fragment's text still begins with that token (this is what
        the recursive caller relies on).  A fragment for which ``asis()`` is
        true is returned unchanged.

    Raises:
        NotImplementedError: for an unknown token or a block token without
            its matching close token.
    """
    org = frag.text()
    if frag.asis():
        return [frag]

    #
    # Get the fonts we might need.
    # We can't use Resources() because we don't know the original font.
    textFont = frag.font()
    textFont.setWeight(QFont.Weight.Normal)
    textFont.setItalic(False)
    textFont.setCapitalization(QFont.Capitalization.MixedCase)
    boldFont = QFont(textFont)
    boldFont.setBold(True)
    italicFont = QFont(textFont)
    italicFont.setItalic(True)
    smallCapsFont = QFont(textFont)
    smallCapsFont.setCapitalization(QFont.Capitalization.SmallCaps)
    # One font serves both {inf} and {sup}: a quarter of the pixel size.
    # NOTE(review): presumably pixelSize() is not positive when the font was
    # sized in points, which would give a zero size here - confirm.
    scriptFont = QFont(textFont)
    scriptFont.setPixelSize(int(scriptFont.pixelSize()/4))
    boldItalicFont = QFont(boldFont)
    boldItalicFont.setItalic(True)
    boldSmallCapsFont = QFont(smallCapsFont)
    boldSmallCapsFont.setBold(True)
    capsFont = QFont(textFont)
    capsFont.setCapitalization(QFont.Capitalization.AllUppercase)
    #
    # Default color:
    #
    baseColor = frag.color()
    r = Resources()

    results: list[Fragment] = []
    while True:
        text = frag.text()
        start = text.find("{")
        if start < 0:
            # No markup left: the remainder is emitted as is.
            results.append(frag)
            return results
        if start > 0:
            # Emit the plain text in front of the next token as a copy of
            # frag, then loop again with the token at position 0.
            newFrag = Fragment(frag)
            newFrag.setText(text[:start])
            results.append(newFrag)
            frag.setText(text[start:])
            continue
        #
        # Start == 0
        #

        #
        # If the token is an end-token, return now.
        #
        if text.startswith("{/"):
            results.append(frag)
            return results

        #
        # extract this token
        #
        # NOTE(review): if no "}" follows, find() returns -1, so the token
        # becomes text[1:-1] and frag's text is left unchanged - confirm the
        # input can never contain an unterminated token.
        end = text.find("}")
        token = text[1:end]
        frag.setText(text[end + 1 :])
        # NOTE(review): this copy is never used; both later uses of newFrag
        # assign a new value first.
        newFrag = Fragment(frag)
        oldFont = QFont(frag.font())
        if token == "bc":
            results.append(Fragment(": ", boldFont, color=baseColor))
            continue
        if token in [
            "b",
            "inf",
            "it",
            "sc",
            "sup",
            "phrase",
            "parahw",
            "gloss",
            "qword",
            "wi",
            "dx",
            "dx_def",
            "dx_ety",
            "ma",
        ]:
            # Block tokens: style (or prefix) the remainder, then recurse.
            if token == "b":
                frag.setFont(boldFont)
            elif token in ["it", "qword", "wi"]:
                frag.setFont(italicFont)
            elif token == "sc":
                frag.setFont(smallCapsFont)
            elif token in ["inf", "sup"]:
                frag.setFont(scriptFont)
            elif token == "phrase":
                frag.setFont(boldItalicFont)
            elif token == "parahw":
                frag.setFont(boldSmallCapsFont)
            elif token == "gloss":
                frag.setText("[" + frag.text())
            elif token in ["dx", "dx_ety"]:
                frag.setText("\u2014" + frag.text())
            elif token == "ma":
                frag.setText("\u2014 more at " + frag.text())
            elif token == "dx_def":
                frag.setText("(" + frag.text())
            else:
                # Unreachable: every token in the list above has a branch.
                raise NotImplementedError(f"Unknown block marker: {token}")
            results += parseText(frag)
            # The recursion stopped at an end token (or ran out of text); the
            # last fragment holds whatever is left to parse.
            frag = results.pop()
            frag.setFont(oldFont)
            text = frag.text()
            if not text.startswith("{/" + token + "}"):
                raise NotImplementedError(
                    f"No matching close for {token} in {org}"
                )
            # Close the bracket opened above on the last emitted fragment.
            if token == "gloss":
                results[-1].setText(results[-1].text() + "]")
            elif token == "dx_def":
                results[-1].setText(results[-1].text() + ")")
            end = text.find("}")
            text = text[end + 1 :]
            frag.setText(text)
            continue
        #
        # These are codes that include all information within the token
        #
        fields = token.split("|")
        token = fields[0]
        if token in [
            "a_link",
            "d_link",
            "dxt",
            "et_link",
            "i_link",
            "mat",
            "sx",
        ]:
            wref = ""
            # fields[1] is always the text shown for the link.
            htext = fields[1]
            oldFont = QFont(frag.font())
            target = "word"
            if token == "a_link":
                wref = fields[1]
            elif token in ["d_link", "et_link", "mat", "sx", "i_link"]:
                # An empty fields[2] means the shown text is also the
                # reference.
                if fields[2] == "":
                    wref = fields[1]
                else:
                    wref = fields[2]
                if token == "i_link":
                    frag.setFont(italicFont)
                elif token == "sx":
                    frag.setFont(capsFont)
            elif token == "dxt":
                # fields[3] selects the kind of target.
                if fields[3] == "illustration":
                    wref = fields[2]
                    target = "article"
                elif fields[3] == "table":
                    wref = fields[2]
                    target = "table"
                elif fields[3] != "":
                    wref = fields[3]
                    target = "sense"
                else:
                    wref = fields[1]
                    target = "word"
            elif token == "a_link":
                # Unreachable: "a_link" is handled by the first branch.
                target = "word"
                wref = fields[1]
            else:
                # Unreachable: every token in the list above has a branch.
                raise NotImplementedError(f"Unknown code: {token} in {org}")
            # The link is a copy of frag (taking any font set just above)
            # with its own text, reference, target and colour.
            newFrag = Fragment(frag)
            newFrag.setText(htext)
            newFrag.setWRef(wref)
            newFrag.setTarget(target)
            newFrag.setColor(r.linkColor)
            results.append(newFrag)
            frag.setFont(oldFont)
            text = frag.text()
            continue
        raise NotImplementedError(
            f"Unable to locate a known token {token} in {org}"
        )