Source code for ipypublish.filters_pandoc.prepare_raw

""" a panflute filter to find raw elements
and convert them to format-agnostic Span elements
"""
import re
from typing import Union  # noqa: F401
from panflute import Element, Doc, Cite, RawInline, Link  # noqa: F401
import panflute as pf

from ipypublish.filters_pandoc.definitions import (
    ATTRIBUTE_CITE_CLASS,
    PREFIX_MAP,
    PREFIX_MAP_LATEX_R,
    PREFIX_MAP_RST_R,
    RST_KNOWN_ROLES,
    RAWSPAN_CLASS,
    RAWDIV_CLASS,
    CONVERTED_CITE_CLASS,
    CONVERTED_OTHER_CLASS,
    CONVERTED_DIRECTIVE_CLASS,
)
from ipypublish.filters_pandoc.utils import get_panflute_containers, get_pf_content_attr


def create_cite_span(identifiers, rawformat, is_block, prefix="", alt=None):
    """Build a Span wrapping a Cite element for the given identifiers.

    :param identifiers: iterable of citation keys; one ``pf.Citation`` is
        created per key
    :param rawformat: stored in the span's ``raw-format`` attribute
    :param is_block: if true, the span is returned wrapped in a ``pf.Plain``
    :param prefix: key looked up in ``PREFIX_MAP``; the entry supplies the
        base ``classes`` and ``attributes`` of the span
    :param alt: if not None, ``str(alt)`` is stored in the ``alt`` attribute
    :return: a ``pf.Span``, or a ``pf.Plain`` containing it when ``is_block``
    :raises KeyError: if ``prefix`` is not a key of ``PREFIX_MAP``
    """
    citation_list = [pf.Citation(key) for key in identifiers]

    # copy the mapping entry so the shared definitions are never mutated
    prefix_spec = dict(dict(PREFIX_MAP)[prefix])

    span_classes = list(prefix_spec["classes"]) + [
        RAWSPAN_CLASS,
        CONVERTED_CITE_CLASS,
        ATTRIBUTE_CITE_CLASS,
    ]

    span_attributes = dict(prefix_spec["attributes"])
    span_attributes["raw-format"] = rawformat
    if alt is not None:
        span_attributes["alt"] = str(alt)

    span = pf.Span(
        Cite(citations=citation_list),
        classes=span_classes,
        attributes=span_attributes,
    )
    return pf.Plain(span) if is_block else span
def process_html_cites(container, doc):
    # type: (pf.Block, Doc) -> Element
    """Replace raw html ``<cite data-cite="cite_key">text</cite>`` with cite spans.

    The content of ``container`` is scanned for a raw html element holding
    the opening ``<cite data-cite=...>`` tag. Its following siblings are then
    walked until a raw inline html element holding only ``</cite>`` is found.
    The opening tag, everything in between, and the closing tag are replaced
    by a single element from ``create_cite_span``. If no closing tag is
    found, the opening element is kept unchanged.

    :param container: element whose content may hold raw html elements
    :param doc: the document being filtered (not used here)
    :return: ``container`` with its content attribute reassigned, or None if
        the container has no suitable content attribute or it is empty
    """
    # the same set of formats must be accepted for the opening and the
    # closing tag, otherwise an opening tag could never be closed
    html_formats = ("html", "html4", "html5")

    content_attr = get_pf_content_attr(container, pf.RawInline)
    if not content_attr:
        content_attr = get_pf_content_attr(container, pf.RawBlock)
    if not content_attr:
        return None
    initial_content = getattr(container, content_attr)

    if not initial_content:
        return None

    new_content = []
    skip = 0  # number of upcoming elements already consumed by a cite

    for element in initial_content:

        if skip > 0:
            skip -= 1
            continue

        if not (
            isinstance(element, (pf.RawInline, pf.RawBlock))
            and element.format in html_formats
        ):
            new_content.append(element)
            continue

        match = re.match(r"<cite\s*data-cite\s*=\"?([^>\"]*)\"?>", element.text)
        if not match:
            new_content.append(element)
            continue

        # look for the closing tag among the following siblings
        span_content = []
        closing = element.next
        while closing:
            if (
                isinstance(closing, pf.RawInline)
                and closing.format in html_formats
                and re.match(r"^\s*</cite>\s*$", closing.text)
            ):
                break
            span_content.append(closing)
            closing = closing.next

        if not closing:
            # unterminated tag: leave the raw element as it is
            new_content.append(element)
            continue

        # TODO include original content
        new_content.append(
            create_cite_span([match.group(1)], "html", isinstance(element, pf.RawBlock))
        )
        # skip the enclosed elements plus the closing tag itself
        skip = len(span_content) + 1

    setattr(container, content_attr, new_content)
    return container
def process_latex_raw(element, doc):
    # type: (Union[pf.RawInline, pf.RawBlock], pf.Doc) -> pf.Element
    """Convert a raw latex element via ``assess_latex``.

    Only ``RawInline``/``RawBlock`` elements whose format is ``tex`` or
    ``latex`` are considered; anything else yields None. The element text
    is handed to ``assess_latex``, which recognises ``\\tag{content}`` and
    ``\\tag[options]{content}`` and returns a Span (wrapped in a Plain for
    block elements), or None when the text has neither form.

    :param element: the element currently visited by the filter
    :param doc: the document being filtered (not used here)
    :return: the replacement element, or None to leave ``element`` untouched
    """
    if not isinstance(element, (pf.RawInline, pf.RawBlock)):
        return None
    if element.format not in ("tex", "latex"):
        return None

    from_block = isinstance(element, pf.RawBlock)
    return assess_latex(element.text, from_block)
def process_latex_str(block, doc):
    # type: (pf.Block, Doc) -> Union[pf.Block,None]
    """Convert latex commands that pandoc left inside ``Str`` elements.

    This does the same as ``process_latex_raw``, for the case where pandoc
    did not convert the command to a raw element. Every ``Str`` in the
    block's content is split around substrings of the form ``\\tag{content}``
    or ``\\tag[options]{content}``. Each piece recognised by ``assess_latex``
    is replaced by the element it returns; every other piece is kept as a
    new ``Str``.

    :param block: element whose content may hold ``Str`` elements
    :param doc: the document being filtered (not used here)
    :return: ``block`` with its content attribute reassigned, or None if the
        block has no suitable content attribute or it is empty
    """
    # TODO why is pandoc sometimes converting latex tags to Str?
    # >> echo "\cite{a}" | pandoc -f markdown -t json
    # {"blocks":[{"t":"Para","c":[{"t":"RawInline","c":["tex","\\cite{a}"]}]}],"pandoc-api-version":[1,17,5,4],"meta":{}}

    content_attr = get_pf_content_attr(block, pf.Str)
    if not content_attr:
        return None
    initial_content = getattr(block, content_attr)
    if not initial_content:
        return None

    # the capturing group makes re.split keep the matched commands
    command_splitter = re.compile(
        r"(\\[^\{\[]+\{[^\}]+\}|\\[^\{\[]+\[[^\]]*\]\{[^\}]+\})"
    )

    new_content = []
    for element in initial_content:
        if not isinstance(element, pf.Str):
            new_content.append(element)
            continue
        for string in command_splitter.split(element.text):
            if not string:
                continue
            # assess once and reuse the result
            new_element = assess_latex(string, False)
            if new_element is None:
                new_content.append(pf.Str(string))
            else:
                new_content.append(new_element)

    setattr(block, content_attr, new_content)
    return block
def _wrap_latex_span(span, is_block):
    """Return ``span`` wrapped in a ``pf.Plain`` if ``is_block``, else as is."""
    return pf.Plain(span) if is_block else span


def assess_latex(text, is_block):
    """Test if text is a latex command and, if so, convert it to a Span.

    Recognised forms are ``\\tag{content}`` and ``\\tag[options]{content}``,
    optionally surrounded by whitespace.

    - ``\\tag{content}`` with ``tag`` listed in ``PREFIX_MAP_LATEX_R``:
      ``content`` is split on commas, surrounding whitespace is removed from
      each key, and the result of ``create_cite_span`` is returned.
    - any other match: a ``panflute.Span`` with classes ``RAWSPAN_CLASS`` and
      ``CONVERTED_OTHER_CLASS`` is returned, with attributes:

      - format: "latex"
      - tag: <tag>
      - options: <options> (only for the form with options)
      - content: <content>
      - original: <full text>

    Commands with options are always converted to the generic Span, even if
    the tag is listed in ``PREFIX_MAP_LATEX_R``.

    :param text: the string to test
    :param is_block: if true, the returned span is wrapped in a ``pf.Plain``
    :return: the new element, or None if ``text`` has neither form
    """
    # TODO these regexes do not match labels containing nested {} braces
    # use recursive regexes (https://stackoverflow.com/a/26386070/5033292)
    # with https://pypi.org/project/regex/

    # find tags with no option, i.e \tag{label}
    match_latex_noopts = re.match(r"^\s*\\([^\{\[]+)\{([^\}]+)\}\s*$", text)
    if match_latex_noopts:
        tag = match_latex_noopts.group(1)
        content = match_latex_noopts.group(2)
        cite_prefixes = dict(PREFIX_MAP_LATEX_R)
        if tag in cite_prefixes:
            # strip each key so that "\cite{a, b}" yields "a" and "b"
            return create_cite_span(
                [key.strip() for key in content.split(",")],
                "latex",
                is_block,
                prefix=cite_prefixes[tag],
            )
        span = pf.Span(
            classes=[RAWSPAN_CLASS, CONVERTED_OTHER_CLASS],
            attributes={
                "format": "latex",
                "tag": tag,
                "content": content,
                "original": text,
            },
        )
        return _wrap_latex_span(span, is_block)

    # find tags with option, i.e \tag[options]{label}
    match_latex_wopts = re.match(r"^\s*\\([^\{\[]+)\[([^\]]*)\]\{([^\}]+)\}\s*$", text)
    if match_latex_wopts:
        span = pf.Span(
            classes=[RAWSPAN_CLASS, CONVERTED_OTHER_CLASS],
            attributes={
                "format": "latex",
                "tag": match_latex_wopts.group(1),
                "content": match_latex_wopts.group(3),
                "options": match_latex_wopts.group(2),
                "original": text,
            },
        )
        return _wrap_latex_span(span, is_block)

    return None
def process_rst_roles(block, doc):
    # type: (pf.Block, Doc) -> Union[pf.Block,None]
    """Convert rst roles of the form ``:role:`label``` to Span elements.

    "a :ref:`label` b" is converted by pandoc to:
    (Str(a) Space Str(:ref:) Code(label) Space Str(b)),
    so a role is detected as a ``Str`` of the form ``:role:`` directly
    followed by a ``Code`` element.

    - if ``role`` is listed in ``PREFIX_MAP_RST_R``, the pair is replaced by
      the result of ``create_cite_span``; the label is split on commas and
      surrounding whitespace is removed from each key
    - else if ``role`` is in ``RST_KNOWN_ROLES``, the pair is replaced by a
      Span with classes ``RAWSPAN_CLASS`` and ``CONVERTED_OTHER_CLASS`` and
      attributes:

      ::

        attributes={"format": "rst", "role": role,
                    "content": content, "original": original}

    - otherwise both elements are kept unchanged

    :param block: element whose content may hold ``Str`` elements
    :param doc: the document being filtered (not used here)
    :return: ``block`` with its content attribute reassigned, or None if the
        block has no suitable content attribute or it is empty
    """
    content_attr = get_pf_content_attr(block, pf.Str)
    if not content_attr:
        return None
    initial_content = getattr(block, content_attr)
    if not initial_content:
        return None

    cite_prefixes = dict(PREFIX_MAP_RST_R)

    new_content = []
    skip_next = False  # set when the following Code element was consumed

    for element in initial_content:

        if skip_next:
            skip_next = False
            continue

        is_role_marker = (
            isinstance(element, pf.Str)
            and isinstance(element.next, pf.Code)
            and len(element.text) > 2
            and element.text.startswith(":")
            and element.text.endswith(":")
        )
        if not is_role_marker:
            new_content.append(element)
            continue

        role = element.text[1:-1]
        content = element.next.text

        if role in cite_prefixes:
            # strip each key so that "a, b" yields "a" and "b"
            new_content.append(
                create_cite_span(
                    [key.strip() for key in content.split(",")],
                    "rst",
                    False,
                    prefix=cite_prefixes[role],
                )
            )
            skip_next = True
        elif role in RST_KNOWN_ROLES:
            new_content.append(
                pf.Span(
                    classes=[RAWSPAN_CLASS, CONVERTED_OTHER_CLASS],
                    attributes={
                        "format": "rst",
                        "role": role,
                        "content": content,
                        "original": "{0}`{1}`".format(element.text, element.next.text),
                    },
                )
            )
            skip_next = True
        else:
            # unknown role: keep the Str; the Code is kept on the next pass
            new_content.append(element)

    setattr(block, content_attr, new_content)
    return block
def gather_processors(element, doc):
    """Apply all processors to an element in a single filter pass.

    We gather the processors, so that we don't have to do multiple passes.

    Processors that replace one element are tried first, and the first one
    that returns a replacement ends the call. Otherwise the processors that
    rewrite the content of a container are applied in turn, each working on
    the result of the previous one.

    :param element: the element currently visited by the filter
    :param doc: the document being filtered
    :return: the replacement element, or ``element`` itself (its content may
        have been reassigned)
    """
    # apply processors that change one element
    # NOTE(review): process_internal_links is not defined or imported in the
    # visible part of this module -- confirm that it is in scope.
    new_element = process_internal_links(element, doc)
    if new_element is not None:
        return new_element
    new_element = process_latex_raw(element, doc)
    if new_element is not None:
        return new_element

    # apply processors that change multiple inline elements in a block
    # (the check must be made on the element itself, not on the pf classes)
    if isinstance(element, get_panflute_containers(pf.Inline)) or isinstance(
        element, (pf.Table, pf.DefinitionItem)
    ):
        for processor in (process_html_cites, process_latex_str, process_rst_roles):
            new_element = processor(element, doc)
            if new_element is not None:
                element = new_element

    # apply processors that change multiple block elements
    if isinstance(element, get_panflute_containers(pf.Block)):
        new_element = process_html_cites(element, doc)
        if new_element is not None:
            element = new_element

    return element
def _get_directive_inline_arg(block):
    """Return the text after the directive name, up to the first SoftBreak.

    The first three elements of ``block.content`` (``..``, space, name) are
    skipped. Returns an empty string if nothing follows them.
    """
    inline_content = []
    for el in block.content[3:]:
        if isinstance(el, pf.SoftBreak):
            break
        inline_content.append(el)
    if not inline_content:
        return ""
    return pf.stringify(pf.Para(*inline_content)).replace("\n", "").strip()


def wrap_rst_directives(doc):
    """Search for rst directives and labels, and wrap them in Div elements.

    Directives are paragraphs with a top line starting
    ``Str(..)Space()Str(name::)``, optionally directly above a CodeBlock
    holding the directive body. Labels are paragraphs of exactly the form
    ``Str(..)Space()Str(_name:)``.

    - a label is wrapped in a Div with classes ``RAWDIV_CLASS`` and
      ``CONVERTED_OTHER_CLASS``
    - a directive is wrapped in a Div with classes ``RAWDIV_CLASS`` and
      ``CONVERTED_DIRECTIVE_CLASS``, and attributes ``format``,
      ``directive`` (the name without ``::``), ``inline`` (text after the
      name on the first line) and ``has_body``. If a CodeBlock follows, its
      text is converted with ``pf.convert_text`` and appended to the Div,
      and the CodeBlock itself is dropped from the document.

    ``doc.content`` is reassigned in place; nothing is returned.

    :param doc: the document to modify
    """
    final_blocks = []
    skip_next = False  # set when the following CodeBlock was consumed

    for block in doc.content:

        if skip_next:
            skip_next = False
            continue

        if not isinstance(block, pf.Para) or len(block.content) < 3:
            final_blocks.append(block)
            continue

        starts_with_marker = (
            isinstance(block.content[0], pf.Str)
            and block.content[0].text == ".."
            and isinstance(block.content[1], pf.Space)
            and isinstance(block.content[2], pf.Str)
        )
        if not starts_with_marker:
            final_blocks.append(block)
            continue

        name = block.content[2].text

        if len(block.content) == 3 and name.startswith("_") and name.endswith(":"):
            # the block is an rst label
            final_blocks.append(
                pf.Div(
                    block,
                    classes=[RAWDIV_CLASS, CONVERTED_OTHER_CLASS],
                    attributes={"format": "rst"},
                )
            )
            continue

        if name.endswith("::"):
            # the block is a directive, with body content if a CodeBlock
            # directly follows it
            # TODO at present we allow any directive name
            # the block may contain option directives, e.g. :width:
            has_body = isinstance(block.next, pf.CodeBlock)
            inline_arg = _get_directive_inline_arg(block)
            # block.next must be read before block is placed in the Div
            body_blocks = pf.convert_text(block.next.text) if has_body else []

            # NOTE(review): has_body is stored as a Python bool rather than
            # a string -- confirm that consumers of this Div expect that.
            final_blocks.append(
                pf.Div(
                    block,
                    *body_blocks,
                    classes=[RAWDIV_CLASS, CONVERTED_DIRECTIVE_CLASS],
                    attributes={
                        "format": "rst",
                        "directive": name[:-2],
                        "inline": inline_arg,
                        "has_body": has_body,
                    }
                )
            )
            skip_next = has_body
            continue

        final_blocks.append(block)

    doc.content = final_blocks
def prepare(doc):
    # type: (Doc) -> None
    """Filter preparation step: wrap rst directives and labels in Divs.

    :param doc: the document to modify, passed to ``wrap_rst_directives``
    """
    wrap_rst_directives(doc)
    return None
def finalize(doc):
    # type: (Doc) -> None
    """Filter finalization step: nothing to do for this filter.

    :param doc: the filtered document (left untouched)
    """
    return None
def main(doc=None, extract_formats=True):
    # type: (Doc, bool) -> None
    """Run the filter: convert citations defined in latex, rst or html
    formats to special Span elements.

    ``gather_processors`` is run as the filter action, with ``prepare`` and
    ``finalize`` as the pre- and post-processing steps.

    :param doc: passed on to ``pf.run_filter``
    :param extract_formats: accepted, but not referenced in this function
    :return: whatever ``pf.run_filter`` returns
    """
    return pf.run_filter(
        gather_processors,
        prepare=prepare,
        finalize=finalize,
        doc=doc,
    )
# run as a pandoc filter when executed as a script
if __name__ == "__main__":
    main()