# Source code for ipypublish.bib2glossary.classes

import copy
import io
import logging
import os

import bibtexparser

from ipypublish.bib2glossary.definitions import (
    ETYPE_GLOSS,
    ETYPE_ACRONYM,
    ETYPE_SYMBOL,
    NEWGLOSS_FIELDS,
    NEWACRONYM_FIELDS,
)

try:
    from collections.abc import MutableMapping
except ImportError:
    from collections import MutableMapping

logger = logging.getLogger(__name__)


class BibGlossEntry(object):
    """A validated glossary, acronym or symbol entry backed by a dict.

    Parameters
    ----------
    entry_dict: dict
        must contain the fields 'ID' and 'ENTRYTYPE' (one of
        ``_allowed_types``), plus 'abbreviation' and 'longname' for acronyms,
        or 'name' and 'description' for glossary terms and symbols.
        The dict is stored by reference (it is not copied).

    Raises
    ------
    KeyError
        if a required field is missing
    TypeError
        if 'ENTRYTYPE' is not one of the allowed types

    """

    _allowed_types = (ETYPE_GLOSS, ETYPE_ACRONYM, ETYPE_SYMBOL)

    def __init__(self, entry_dict):
        self._validate_dict(entry_dict)
        self._entry_dict = entry_dict

    @staticmethod
    def _require_fields(dct, fields):
        """Raise a KeyError naming the first of ``fields`` absent from ``dct``."""
        for field in fields:
            if field not in dct:
                key = dct["ID"] if "ID" in dct else "<unknown>"
                raise KeyError(
                    "the entry '{0}' is missing the required field: '{1}'".format(
                        key, field
                    )
                )

    def _validate_dict(self, dct):
        """Check that ``dct`` holds every field its entry type requires."""
        self._require_fields(dct, ("ID", "ENTRYTYPE"))

        entry_type = dct["ENTRYTYPE"]
        if entry_type not in self._allowed_types:
            raise TypeError("ENTRYTYPE must be one of: {}".format(self._allowed_types))

        if entry_type == ETYPE_ACRONYM:
            self._require_fields(dct, ("abbreviation", "longname"))
        elif entry_type == ETYPE_GLOSS or entry_type == ETYPE_SYMBOL:
            self._require_fields(dct, ("name", "description"))

    def _get_key(self):
        return self._entry_dict["ID"]

    def _set_key(self, key):
        self._entry_dict["ID"] = key

    # the entry's unique identifier (the 'ID' field); settable
    key = property(_get_key, _set_key)

    @property
    def type(self):
        """The entry type (the 'ENTRYTYPE' field)."""
        return self._entry_dict["ENTRYTYPE"]

    def __contains__(self, key):
        return key in self._entry_dict

    def get(self, key):
        """Return the value of field ``key``; a KeyError is raised if absent."""
        return self._entry_dict[key]

    @property
    def label(self):
        """The short form: 'abbreviation' for acronyms, otherwise 'name'."""
        if self.type == ETYPE_ACRONYM:
            return self.get("abbreviation")
        if self.type in (ETYPE_GLOSS, ETYPE_SYMBOL):
            return self.get("name")
        raise NotImplementedError

    @property
    def sortkey(self):
        """The 'sort' field if present, otherwise the lower-cased label."""
        if "sort" in self:
            return self.get("sort")
        return self.label.lower()

    @property
    def plural(self):
        """The 'plural' field if present, otherwise the label suffixed by 's'."""
        if "plural" in self:
            return self.get("plural")
        return "{}s".format(self.label)

    @property
    def text(self):
        """The long form: 'longname' for acronyms, otherwise 'description'."""
        if self.type == ETYPE_ACRONYM:
            return self.get("longname")
        if self.type in (ETYPE_GLOSS, ETYPE_SYMBOL):
            return self.get("description")
        raise NotImplementedError

    def __repr__(self):
        return "BibGlossEntry(key={0},label={1})".format(self.key, self.label)

    def to_dict(self):
        """Return a deep copy of the underlying entry dict."""
        return copy.deepcopy(self._entry_dict)

    def _latex_options(self, fields):
        """Return sorted ``field={value}`` strings for those ``fields`` present."""
        return [
            "{0}={{{1}}}".format(field, self.get(field))
            for field in sorted(fields)
            if field in self
        ]

    def to_latex(self):
        """Return the LaTeX command defining this entry.

        Glossary terms and symbols become a ``newglossaryentry`` command
        (symbols gain the extra option ``type={symbols}``);
        acronyms become a ``newacronym`` command, with an optional
        ``[...]`` argument when any of NEWACRONYM_FIELDS are present.

        Raises
        ------
        NotImplementedError
            if the entry type is not handled (consistent with
            ``label`` and ``text``, rather than silently returning None)

        """
        if self.type in [ETYPE_GLOSS, ETYPE_SYMBOL]:
            options = self._latex_options(NEWGLOSS_FIELDS)
            if self.type == ETYPE_SYMBOL:
                options.append("type={symbols}")
            body = "{{{key}}}{{\n    {options}\n}}".format(
                key=self.key, options=",\n    ".join(options)
            )
            return "\\newglossaryentry" + body

        if self.type == ETYPE_ACRONYM:
            body = "{{{key}}}{{{abbrev}}}{{{long}}}".format(
                key=self.key, abbrev=self.label, long=self.text
            )
            options = self._latex_options(NEWACRONYM_FIELDS)
            if options:
                body = "[" + ",".join(options) + "]" + body
            return "\\newacronym" + body

        raise NotImplementedError


class BibGlossDB(MutableMapping):
    """A mutable mapping of entry keys to ``BibGlossEntry`` objects.

    Entries can be loaded from a bib file (``load_bib``), a tex file
    (``load_tex``) or a path whose type is guessed (``load``), and exported
    as a dict, a bib string or LaTeX definitions.
    """

    # recognised file extensions, in the order they are searched for
    _bib_extensions = (".bib", ".bibtex", ".biblatex")
    _tex_extensions = (".tex", ".latex")

    def __init__(self):
        self._entries = {}

    def __getitem__(self, key):
        return self._entries[key]

    def __setitem__(self, key, entry):
        # keep the mapping key and the entry's own key in agreement
        if not isinstance(entry, BibGlossEntry):
            raise ValueError("value must be a BibGlossEntry")
        if key != entry.key:
            raise ValueError("key must equal entry.key")
        self._entries[key] = entry

    def __delitem__(self, key):
        del self._entries[key]

    def __iter__(self):
        return iter(self._entries)

    def __len__(self):
        return len(self._entries)

    @staticmethod
    def get_fake_entry_obj(key):
        """Return a placeholder glossary entry, named ``key``,
        with an empty description.
        """
        return BibGlossEntry(
            {"ENTRYTYPE": ETYPE_GLOSS, "ID": key, "name": key, "description": ""}
        )

    def load_bib(
        self,
        text_str=None,
        path=None,
        bibdb=None,
        encoding="utf8",
        ignore_nongloss_types=False,
        ignore_duplicates=False,
    ):
        """load a bib file, replacing any entries currently held

        Exactly one of ``text_str``, ``path`` or ``bibdb`` must be supplied.

        Parameters
        ----------
        text_str=None: str or None
            string representing the bib file contents
        path=None: str or None
            path to bibfile
        bibdb=None: bibtexparser.bibdatabase.BibDatabase or None
            an already parsed bib database
        encoding="utf8": str
            bib file encoding
        ignore_nongloss_types: bool
            if False, a TypeError will be raised for non-gloss types,
            otherwise such entries are skipped (with a logged warning)
        ignore_duplicates: bool
            if False, a KeyError will be raised if multiple entries are found
            with the same key, otherwise only the first entry will be used

        Returns
        -------
        bool
            True, once all entries are loaded

        Raises
        ------
        ValueError
            if not exactly one source is supplied,
            or ``bibdb`` is not a BibDatabase
        KeyError
            if an entry lacks a required field (irrespective of the
            ``ignore_*`` flags), or for duplicate keys (see above)
        TypeError
            for non-gloss entry types (see above)

        """
        supplied = [arg for arg in (text_str, path, bibdb) if arg is not None]
        if len(supplied) != 1:
            raise ValueError("only one of text_str, path or bibdb must be supplied")

        if bibdb is not None:
            if not isinstance(bibdb, bibtexparser.bibdatabase.BibDatabase):
                raise ValueError("bibdb is not a BibDatabase instance")
            bib = bibdb
        else:
            if path is not None:
                with io.open(path, encoding=encoding) as fobj:
                    text_str = fobj.read()
            parser = bibtexparser.bparser.BibTexParser()
            parser.ignore_nonstandard_types = False
            parser.encoding = encoding
            bib = parser.parse(text_str)
            # TODO doesn't appear to check for key duplication
            # see https://github.com/sciunto-org/python-bibtexparser/issues/237

        entries = {}
        for entry_dict in bib.entries:

            try:
                entry = BibGlossEntry(entry_dict)
            except TypeError:
                if ignore_nongloss_types:
                    logger.warning("Skipping non-glossary entry")
                    continue
                raise

            if entry.key in entries:
                if ignore_duplicates:
                    logger.warning("Skipping duplicate key entry")
                    continue
                raise KeyError(
                    "the bib file contains "
                    "multiple entries with the key: {}".format(entry.key)
                )

            entries[entry.key] = entry

        # only replace the current entries once everything has been validated
        self._entries = entries

        return True

    def load_tex(
        self,
        text_str=None,
        path=None,
        encoding="utf8",
        skip_ioerrors=False,
        ignore_unknown_types=True,
    ):
        """load a tex file, replacing any entries currently held

        Parameters
        ----------
        text_str=None: str or None
            string representing the tex file contents
        path=None: str or None
            path to the tex file
        encoding="utf8": str
            tex file encoding
        skip_ioerrors: bool
            if False, an IOError will be raised if
            newglossaryterm or newacronym is badly formatted
        ignore_unknown_types: bool
            if True, strip unknown types, otherwise raise a ValueError

        Returns
        -------
        bool
            True, once all entries are loaded

        Notes
        -----
        the texsoup package is required.

        if a newglossaryterm has field 'type={symbols}', then
        it will be loaded as a symbol

        """
        # imported here, so that texsoup is only required when loading tex
        from ipypublish.bib2glossary.parse_tex import parse_tex

        gterms, acronyms = parse_tex(
            text_str=text_str, path=path, encoding=encoding, skip_ioerrors=skip_ioerrors
        )
        entries = {}
        for key, fields in gterms.items():

            fields["ENTRYTYPE"] = ETYPE_GLOSS
            if fields.get("type", None) == "symbols":
                fields["ENTRYTYPE"] = ETYPE_SYMBOL
                fields.pop("type")
            elif "type" in fields:
                if not ignore_unknown_types:
                    raise ValueError(
                        "the 'type' is not recognised: " "{}".format(fields["type"])
                    )
                fields.pop("type")

            fields["ID"] = key
            entry = BibGlossEntry(fields)
            entries[entry.key] = entry

        for key, fields in acronyms.items():
            fields["ENTRYTYPE"] = ETYPE_ACRONYM
            fields["ID"] = key
            entry = BibGlossEntry(fields)
            entries[entry.key] = entry

        self._entries = entries

        return True

    @staticmethod
    def guess_path(path):
        """guess the path of a bib or tex file, with or without a file
        extension, from the available files in the path folder

        Parameters
        ----------
        path: str
            a path; if it already has a recognised bib/tex extension
            it is returned unchanged (its existence is not checked)

        Returns
        -------
        str or None
            the first existing file found by swapping the extension of
            ``path`` for .bib, .bibtex, .biblatex, .tex then .latex,
            or None if there is no such file

        """
        basepath, extension = os.path.splitext(str(path))
        known_extensions = BibGlossDB._bib_extensions + BibGlossDB._tex_extensions
        if extension in known_extensions:
            return path
        for known in known_extensions:
            candidate = basepath + known
            if os.path.exists(candidate):
                return candidate
        return None

    def load(self, path, encoding="utf8"):
        """load a file, the type will be guessed from the extension,
        or (if no extension is given), the available files in the path folder

        Parameters
        ----------
        path: str
        encoding='utf8': str
            encoding of the file

        Raises
        ------
        IOError
            if no bib or tex file can be found for the path

        """
        # keep the original path intact, so it can be reported on failure
        resolved = self.guess_path(path)
        if resolved is None:
            raise IOError("no acceptable loader found for path: {}".format(path))
        extension = os.path.splitext(str(resolved))[1]
        if extension in self._bib_extensions:
            self.load_bib(path=resolved, encoding=encoding)
        elif extension in self._tex_extensions:
            self.load_tex(path=resolved, encoding=encoding)

    def to_dict(self):
        """Return a dict of {<key>: <copy of the entry's dict>}."""
        return {k: e.to_dict() for k, e in self.items()}

    def to_bib_string(self):
        """Return the entries written as a bib file string."""
        bibdb = bibtexparser.bibdatabase.BibDatabase()
        bibdb.entries = [e.to_dict() for e in self.values()]
        writer = bibtexparser.bwriter.BibTexWriter()
        writer.contents = ["comments", "entries"]
        writer.indent = "  "
        return writer.write(bibdb)

    def to_latex_dict(self, splitlines=True):
        """convert to dict of latex strings

        Parameters
        ----------
        splitlines: bool
            if True, each latex string is split into a list of lines

        Returns
        -------
        dict:
            {(<type>, <key>): <latex string>}

        """
        latex_strings = {}
        for entry in self.values():
            string = entry.to_latex()
            if splitlines:
                string = string.splitlines()
            latex_strings[(entry.type, entry.key)] = string
        return latex_strings

    def to_latex_string(self):
        """Return all latex definitions, sorted by (<type>, <key>)
        and joined by newlines.
        """
        latex_dict = self.to_latex_dict(splitlines=False)
        return "\n".join(latex_dict[key] for key in sorted(latex_dict))