Source code for ipypublish.filters_pandoc.utils

from collections import OrderedDict
import copy
import io
import json
import re

from six import string_types
from nbconvert.utils.pandoc import get_pandoc_version
from distutils.version import LooseVersion
import panflute as pf

from panflute import Element, Doc  # noqa: F401
from types import FunctionType  # noqa: F401

from ipypublish.filters_pandoc.definitions import IPUB_META_ROUTE


def apply_filter(
    in_object,
    filter_func=None,
    out_format="panflute",
    in_format="markdown",
    strip_meta=False,
    strip_blank_lines=False,
    replace_api_version=True,
    dry_run=False,
    **kwargs
):
    """Convenience function to apply one or more panflute filters.

    The input may be a string, a list of string lines, a pandoc JSON AST
    (as a dict) or an existing ``panflute.Doc``.

    Parameters
    ----------
    in_object: str or list[str] or tuple[str] or dict or panflute.Doc
        the document to filter; a dict must be a pandoc JSON AST containing
        the 'meta', 'blocks' and 'pandoc-api-version' keys
    filter_func: callable or list or tuple or set or None
        the filter function, or a collection of filter functions, each
        called as ``func(doc, **kwargs)`` and expected to return the
        (possibly new) document; if None, no filter is applied
    out_format: str
        output format for use by pandoc or, if 'panflute',
        return the panflute.Doc
    in_format: str
        the pandoc input format (must be 'json' for dict input)
    strip_meta: bool
        replace the document metadata by an empty dict
        before the final conversion
    strip_blank_lines: bool
        replace every pair of consecutive newlines in the output string
        by a single newline (one pass only)
    replace_api_version: bool
        for dict input only, if True, find the api_version of the
        available pandoc and reformat the json as appropriate
    dry_run: bool
        If True, return the Doc object, before applying the filter
    kwargs:
        passed on to every filter function

    Returns
    -------
    str or panflute.Doc
        the Doc if ``dry_run`` is True or ``out_format`` is 'panflute',
        otherwise the converted string

    Raises
    ------
    AssertionError
        if a dict is supplied and ``in_format`` is not 'json'
    ValueError
        if a dict is supplied that lacks one of the required keys
    TypeError
        if ``in_object`` is of an unsupported type

    """
    if isinstance(in_object, pf.Doc):
        doc = in_object
    else:
        if isinstance(in_object, dict):
            if not in_format == "json":
                raise AssertionError(
                    "the in_format for a dict should be json, "
                    "not {}".format(in_format)
                )
            for required in ("meta", "blocks", "pandoc-api-version"):
                if required not in in_object:
                    raise ValueError(
                        "the in_object does not contain "
                        "a '{}' key".format(required)
                    )
            if replace_api_version:
                # run pandoc on a null object, to get the correct api version
                null_raw = pf.run_pandoc("", args=["-t", "json"])
                null_stream = io.StringIO(null_raw)
                api_version = pf.load(null_stream).api_version

                # see panflute.load, w.r.t to legacy version
                if api_version is None:
                    in_object = [
                        {"unMeta": in_object["meta"]},
                        in_object["blocks"],
                    ]
                else:
                    ans = OrderedDict()
                    ans["pandoc-api-version"] = api_version
                    ans["meta"] = in_object["meta"]
                    ans["blocks"] = in_object["blocks"]
                    in_object = ans
            in_str = json.dumps(in_object)
        elif isinstance(in_object, (list, tuple)):
            in_str = "\n".join(in_object)
        elif isinstance(in_object, string_types):
            in_str = in_object
        else:
            raise TypeError("object not accepted: {}".format(in_object))

        doc = pf.convert_text(in_str, input_format=in_format, standalone=True)

    doc.format = out_format

    if dry_run:
        return doc

    # normalise the filters to an iterable; the default (None) means
    # "no filter" rather than an attempt to call None
    if filter_func is None:
        filter_func = []
    elif not isinstance(filter_func, (list, tuple, set)):
        filter_func = [filter_func]

    out_doc = doc
    for func in filter_func:
        out_doc = func(out_doc, **kwargs)  # type: Doc

    # post-process Doc
    if strip_meta:
        out_doc.metadata = {}
    if out_format == "panflute":
        return out_doc

    # create out str
    out_str = pf.convert_text(
        out_doc, input_format="panflute", output_format=out_format
    )

    # post-process final str
    if strip_blank_lines:
        out_str = out_str.replace("\n\n", "\n")

    return out_str
def compare_version(target, comparison):
    """Compare the available pandoc version against a target version.

    The available version is the one reported by ``get_pandoc_version``.

    Parameters
    ----------
    target: str
        target version of pandoc
    comparison: str
        one of '>', '<', '<=', '>=', '=='

    Returns
    -------
    bool
        the result of ``<available version> <comparison> <target>``

    Raises
    ------
    ValueError
        if the comparison is not one of the recognised operators

    """
    # TODO this only works if you are
    # converting json in the same environment
    installed = LooseVersion(get_pandoc_version())
    wanted = LooseVersion(target)

    # checked in order; only the selected comparison is ever evaluated
    checks = (
        (">=", lambda: installed >= wanted),
        ("<=", lambda: installed <= wanted),
        (">", lambda: installed > wanted),
        ("<", lambda: installed < wanted),
        ("==", lambda: installed == wanted),
    )
    for symbol, evaluate in checks:
        if comparison == symbol:
            return evaluate()

    raise ValueError("comparison not recognised: {}".format(comparison))
def strip_quotes(string):
    # type: (str) -> str
    """Remove a matching pair of enclosing quotes from a string.

    A surrounding pair of single quotes is removed first, then a
    surrounding pair of double quotes (so ``'"a"'`` becomes ``a``).
    Strings without a matching enclosing pair are returned unchanged.
    """
    for quote in ("'", '"'):
        if string.startswith(quote) and string.endswith(quote):
            string = string[1:-1]
    return string
def find_attributes(
    element, allow_space=True, search_left=False, include_element=False
):
    """Find and parse the attribute 'container' adjacent to an element.

    The container is expected to follow (or precede) the element,
    optionally separated by space, and to look like
    ``{#id .class-a .class-b a=1 b="a string"}``.

    Parameters
    ----------
    element:
        the element to find attributes for
    allow_space=True: bool
        whether to allow space between the element
        and attribute container
    search_left=False: bool
        search to the left of the element, rather than the right
    include_element=False: bool
        whether to include the element in the search

    Returns
    -------
    dict or None:
        None if no container was found, otherwise a dict with the keys
        "id" (str), "classes" (list[str]), "attributes" (dict),
        "elements" (list[Element], the elements containing the
        attributes, including space) and "append"

    """
    search = _search_attribute_left if search_left else _search_attribute_right
    return search(element, include_element, allow_space)
def _search_attribute_right(element, include_element, allow_space):
    """Search to the right of an element for an attribute container.

    Siblings are walked through their ``next`` attribute.

    Parameters
    ----------
    element:
        the element to start from
    include_element: bool
        if True the search starts at ``element`` itself,
        otherwise at ``element.next``
    allow_space: bool
        if True, ``pf.Space`` elements found before the container
        are stepped over (and recorded)

    Returns
    -------
    dict or None:
        None if no complete ``{...}`` container was found, otherwise a dict
        with the keys "id", "classes", "attributes", "elements" (every
        element visited, including leading spaces) and "append" (a
        ``pf.Str`` holding any text left after the closing brace, or None)

    Raises
    ------
    ValueError
        if the text of the collected elements does not start
        with a ``{...}`` group

    """
    if (not element.next) and not include_element:
        return None

    if include_element:
        adjacent = element
    else:
        adjacent = element.next

    attr_elements = []
    found_start = False
    found_end = False
    # Phase 1: step over spaces (if allowed) and inspect the first other
    # element; the loop always ends at that element, whatever it is.
    while adjacent:
        if isinstance(adjacent, pf.Space) and allow_space:
            attr_elements.append(adjacent)
            adjacent = adjacent.next
            continue
        elif (
            isinstance(adjacent, pf.Str)
            # the text begins with a complete {...} group
            # (characters after the closing brace are permitted)
            and re.search(r"^\{[^}]*\}", adjacent.text)
        ):
            # TODO this won't handle } in strings, e.g. {a="} "}
            found_start = True
            found_end = True
            attr_elements.append(adjacent)
            break
        elif (
            isinstance(adjacent, pf.Str)
            # the text contains an opening brace not preceded by a closing one
            and re.search(r"^[^\}]*\{", adjacent.text)
        ):
            found_start = True
            found_end = False
            attr_elements.append(adjacent)
            break
        # any other element means there is no container: stop searching
        break

    # Phase 2: only the opening brace was found, so keep collecting
    # every following element until one closes the container.
    if found_start and not found_end:
        adjacent = adjacent.next
        while adjacent:
            if (
                isinstance(adjacent, pf.Str)
                # the text contains a closing brace not preceded by an opening one
                and re.search(r"^[^\{]*\}", adjacent.text)
            ):
                # TODO this won't handle } in strings, e.g. {a="} "}
                found_end = True
                attr_elements.append(adjacent)
                break
            else:
                attr_elements.append(adjacent)
            adjacent = adjacent.next

    # also covers running out of siblings before a closing brace was seen
    if not (found_start and found_end):
        return None

    attribute_str = pf.stringify(pf.Para(*attr_elements)).replace("\n", " ").strip()

    # split into the label and the rest
    match = re.match(r"^\{(#[^\s]+|)([^\}]*)\}", attribute_str)
    if not match:
        raise ValueError(attribute_str)
    classes, attributes = process_attributes(match.group(2))

    # whatever follows the closing brace is handed back under "append"
    new_str = attribute_str[len(match.group(0)) :]

    return {
        # drop the leading '#'; this is an empty string if no id was given
        "id": match.group(1)[1:],
        "classes": classes,
        "attributes": attributes,
        "elements": attr_elements,
        "append": pf.Str(new_str) if new_str else None,
    }


def _search_attribute_left(element, include_element, allow_space):
    """Search to the left of an element for an attribute container.

    Siblings are walked through their ``prev`` attribute, so the container
    is met closing brace first.

    Parameters
    ----------
    element:
        the element to start from
    include_element: bool
        if True the search starts at ``element`` itself,
        otherwise at ``element.prev``
    allow_space: bool
        if True, ``pf.Space`` elements found before the container
        are stepped over (and recorded)

    Returns
    -------
    dict or None:
        None if no complete ``{...}`` container was found, otherwise a dict
        with the keys "id", "classes", "attributes", "elements" (every
        element visited, restored to document order) and "append"
        (always None)

    Raises
    ------
    ValueError
        if the text of the collected elements is not exactly
        one ``{...}`` group

    """
    if (not element.prev) and not include_element:
        return None

    if include_element:
        adjacent = element
    else:
        adjacent = element.prev

    attr_elements = []
    found_start = False
    found_end = False
    # Phase 1: step over spaces (if allowed) and inspect the first other
    # element; the loop always ends at that element, whatever it is.
    while adjacent:
        if isinstance(adjacent, pf.Space) and allow_space:
            attr_elements.append(adjacent)
            adjacent = adjacent.prev
            continue
        elif (
            isinstance(adjacent, pf.Str)
            and adjacent.text.endswith("}")
            and adjacent.text.startswith("{")
        ):
            # the whole container sits in a single Str
            # TODO this won't handle } in strings, e.g. {a="} "}
            # TODO this won't handle characters after } e.g. {a=1})
            found_start = True
            found_end = True
            attr_elements.append(adjacent)
            break
        elif isinstance(adjacent, pf.Str) and adjacent.text.endswith("}"):
            # only the closing brace has been seen so far
            found_start = False
            found_end = True
            attr_elements.append(adjacent)
            break
        # any other element means there is no container: stop searching
        break

    # Phase 2: only the closing brace was found, so keep collecting
    # every preceding element until one opens the container.
    if found_end and not found_start:
        adjacent = adjacent.prev
        while adjacent:
            if isinstance(adjacent, pf.Str) and adjacent.text.startswith("{"):
                # TODO this won't handle { in strings, e.g. {a="{ "}
                # TODO this won't handle characters before { e.g. ({a=1}
                found_start = True
                attr_elements.append(adjacent)
                break
            else:
                attr_elements.append(adjacent)
            adjacent = adjacent.prev

    # also covers running out of siblings before an opening brace was seen
    if not (found_start and found_end):
        return None

    # elements were collected right-to-left; put them back in document order
    attr_elements = list(reversed(attr_elements))

    attribute_str = pf.stringify(pf.Para(*attr_elements)).replace("\n", " ").strip()

    # split into the label and the rest
    match = re.match("^\\{(#[^\\s]+|)([^\\}]*)\\}$", attribute_str)
    if not match:
        raise ValueError(attribute_str)
    classes, attributes = process_attributes(match.group(2))

    return {
        # drop the leading '#'; this is an empty string if no id was given
        "id": match.group(1)[1:],
        "classes": classes,
        "attributes": attributes,
        "elements": attr_elements,
        "append": None,
    }
def process_attributes(attr_string):
    """Process a string of classes and attributes.

    For example ``'.class-name .other a=1 b="some text"'``
    will be returned as
    ``["class-name", "other"], {"a": "1", "b": "some text"}``.

    Names (of classes and of attribute keys) start with a letter, hyphen
    or underscore and may continue with letters, digits, hyphens or
    underscores. Each must be at the start of the string or preceded by
    whitespace. Attribute values are always returned as strings, with
    their enclosing quotes removed by ``strip_quotes``.

    Parameters
    ----------
    attr_string: str
        the content of an attribute container, without the braces or id

    Returns
    -------
    list[str]:
        classes, without the leading '.'
    dict[str, str]:
        attributes

    """
    name = r"[\-_a-zA-Z][\-_a-zA-Z0-9]*"

    # find classes, denoted by .class-name
    classes = [
        found[1][1:]
        for found in re.findall(r"(^|\s)(\." + name + r")", attr_string)
    ]

    # find attributes, denoted by a=b, respecting quotes; a quoted value
    # stops at the first matching quote, so that several quoted attributes
    # in one string are not merged into a single value
    attr = {
        found[1]: strip_quotes(found[2])
        for found in re.findall(
            r"(^|\s)(" + name + r")\s*=\s*(\"[^\"]*\"|'[^']*'|[^\s\"']+)",
            attr_string,
        )
    }
    # TODO this generally works, but should be stricter against any weird
    # fringe cases (e.g. quotes nested in quoted values)
    # TODO add tests
    return classes, attr
def convert_attributes(attr):
    """Attempt to convert attribute values to python types.

    Every value is passed to ``json.loads``; when that succeeds the value
    is replaced by the decoded object (e.g. float, list, dict), otherwise
    it is left as it was. The input mapping is not modified: the
    conversion is applied to a deep copy, which is returned.
    """
    converted = copy.deepcopy(attr)
    for name, raw_value in list(converted.items()):
        try:
            converted[name] = json.loads(raw_value)
        except Exception:
            # not decodable as JSON: keep the original value
            continue
    return converted
def convert_units(string, out_units):
    """Convert a value with an optional unit suffix to the requested units.

    ``string`` is converted with ``str`` and must consist of a
    non-negative decimal number, optionally followed by a unit made of
    lowercase letters and/or '%'. A value without a unit is treated as
    having the unit "fraction".

    Parameters
    ----------
    string:
        the value to convert, e.g. ``"50%"`` or ``0.5``
    out_units: str
        the units to convert to

    Returns
    -------
    float

    Raises
    ------
    ValueError
        if the value cannot be parsed, or if no conversion is known
        between the two units (only "%" and "fraction" are inter-convertible)

    """
    parsed = re.match("^\\s*([0-9]+\\.?[0-9]*)([a-z\\%]*)\\s*$", str(string))
    if parsed is None:
        raise ValueError("string could not be resolved as a value: {}".format(string))

    number = float(parsed.group(1))
    in_units = parsed.group(2) or "fraction"

    # nothing to do when the value is already in the requested units
    if in_units == out_units:
        return number

    conversions = {
        ("%", "fraction"): lambda amount: amount / 100.0,
        ("fraction", "%"): lambda amount: amount * 100.0,
    }
    converter = conversions.get((in_units, out_units))
    if converter is not None:
        return converter(number)

    raise ValueError(
        "could not find a conversion for "
        "{0} to {1}: {2}".format(in_units, out_units, string)
    )
def get_option(locations, keypath, default=None, delimiter=".", error_on_missing=False):
    """Fetch an option variable from a hierarchy of preferred locations.

    The value returned will be from the first location
    in which the full key path can be followed, or else the default.

    Parameters
    ----------
    locations: list[dict]
        a list of mappings to search in, in order of preference
    keypath: list[str] or str
        a key path to search in, if str, then split by delimiter
    default=None: object
        a default value to return, if the key path is in no location
    delimiter: str
        the separator used to split a str keypath
    error_on_missing: bool
        raise ValueError (instead of returning the default),
        if the key path is not found in any of the locations

    Examples
    --------
    >>> a = {"m": 1}
    >>> b = {"x": {"y": 2}}
    >>> c = {"x": {"y": 3}}
    >>> get_option([a, b, c], keypath=("x", "y"))
    2
    >>> get_option([a, c, b], keypath=("x", "y"))
    3
    >>> get_option([a, c, b], keypath="x.y")
    3
    >>> get_option([a, c, b], keypath="l", default=4)
    4

    """
    if isinstance(keypath, string_types):
        keypath = keypath.split(delimiter)

    for location in locations:
        node = location
        for key in keypath:
            try:
                node = node[key]
            except (KeyError, TypeError):
                # this location does not hold the path; try the next one
                break
        else:
            # every key was resolved
            return node

    if error_on_missing:
        raise ValueError("could not retrieve the option keypath: {}".format(keypath))
    return default
def create_ipub_meta(options):
    """Nest ``options`` in a new dict, under the ``IPUB_META_ROUTE`` key path.

    ``IPUB_META_ROUTE`` is split on "."; every part except the last becomes
    a nested dict and the last part is the key that holds ``options``.
    """
    route = IPUB_META_ROUTE.split(".")
    meta = {}
    level = meta
    for key in route[:-1]:
        child = {}
        level[key] = child
        level = child
    level[route[-1]] = options
    return meta
def get_panflute_containers(element):
    """Return all possible container classes for an element class.

    Parameters
    ----------
    element: type
        a panflute element class (it is tested with ``issubclass``)

    Returns
    -------
    tuple:
        the inline container classes if ``element`` is a ``pf.Inline``
        subclass, the block container classes if it is a ``pf.Block``
        subclass

    Raises
    ------
    TypeError
        if ``element`` is neither an Inline nor a Block subclass

    """
    holders_of_inlines = (
        pf.Cite,
        pf.Emph,
        pf.Header,
        pf.Image,
        pf.LineItem,
        pf.Link,
        pf.Para,
        pf.Plain,
        pf.Quoted,
        pf.SmallCaps,
        pf.Span,
        pf.Strikeout,
        pf.Strong,
        pf.Subscript,
        pf.Superscript,
    )
    holders_of_blocks = (
        pf.BlockQuote,
        pf.Definition,
        pf.Div,
        pf.Doc,
        pf.ListItem,
        pf.Note,
        pf.TableCell,
    )

    if issubclass(element, pf.Inline):
        return holders_of_inlines
    if issubclass(element, pf.Block):
        return holders_of_blocks
    raise TypeError("not Inline or Block: {}".format(element))
def get_pf_content_attr(container, target):
    """Return the attribute of a container that can hold the target class.

    Parameters
    ----------
    container:
        a panflute element instance (it is tested with ``isinstance``)
    target: type
        a panflute element class (it is tested with ``issubclass``)

    Returns
    -------
    str or False:
        for an Inline target: "caption" if the container is a ``pf.Table``,
        "term" if it is a ``pf.DefinitionItem``, "content" for the other
        inline containers; for a Block target: "content" for the block
        containers; False if the container cannot hold the target

    Raises
    ------
    TypeError
        if ``target`` is neither an Inline nor a Block subclass

    """
    inline_holders_except_cite = (
        pf.Emph,
        pf.Header,
        pf.Image,
        pf.LineItem,
        pf.Link,
        pf.Para,
        pf.Plain,
        pf.Quoted,
        pf.SmallCaps,
        pf.Span,
        pf.Strikeout,
        pf.Strong,
        pf.Subscript,
        pf.Superscript,
        pf.Table,
        pf.DefinitionItem,
    )
    inline_holders = (pf.Cite,) + inline_holders_except_cite
    block_holders = (
        pf.BlockQuote,
        pf.Definition,
        pf.Div,
        pf.Doc,
        pf.ListItem,
        pf.Note,
        pf.TableCell,
    )

    # we assume a Cite can't contain another Cite
    if issubclass(target, pf.Cite) and not isinstance(
        container, inline_holders_except_cite
    ):
        return False

    if issubclass(target, pf.Inline):
        if not isinstance(container, inline_holders):
            return False
        if isinstance(container, pf.Table):
            return "caption"
        if isinstance(container, pf.DefinitionItem):
            return "term"
        return "content"

    if issubclass(target, pf.Block):
        return "content" if isinstance(container, block_holders) else False

    raise TypeError("target not Inline or Block: {}".format(target))