Translating Markdown to Python¶
A primary translation in literate programming is the tangle step that converts the literate program into the programming language. The 1979 implementation converts ".WEB"
files to valid Pascal (".PAS")
files. The pidgy
approach begins with Markdown files and produces proper Python files as the outcome. The rest of this document configures how [IPython] acknowledges the transformation and the heuristics that translate Markdown to Python.
[1]:
import typing, mistune, IPython, pidgy.util
__all__ = 'tangle', 'Tangle'
The pidgy
tangle workflow has three steps:
- Block-level lexical analysis to tokenize Markdown.
- Normalization of the tokens into compacted "code" and non-"code" tokens.
- Translation of the normalized tokens to a string of valid Python code.
[2]:
@pidgy.implementation
def tangle(str: str) -> str:
    """Tangle a Markdown source string into a valid Python source string."""
    lexer = Tangle()
    tokens = lexer.parse(''.join(str))
    return lexer.stringify(tokens)
[3]:
class pidgyManager(IPython.core.inputtransformer2.TransformerManager):
    """An input-transformer manager that tangles Markdown into Python
    before handing the cell to IPython's normal transformations."""

    def transform_cell(self, cell):
        # The original ``super(type(self), self)`` spelling recurses
        # infinitely as soon as this class is subclassed; the zero-argument
        # ``super()`` form resolves the MRO correctly in all cases.
        return super().transform_cell(tangle(str=cell))
Block level lexical analysis.¶
pidgy
uses a modified mistune.BlockLexer
to create block level tokens for a [Markdown] source. A specific pidgy
addition is the addition of a doctest
block object; doctests
are testable strings that are ignored by the tangle step. The tokens are then normalized and translated to [Python] strings.
BlockLexer
[4]:
class BlockLexer(mistune.BlockLexer, pidgy.util.ContextDepth):
    """Block-level Markdown lexer with pidgy-specific rules.

    Extends ``mistune.BlockLexer`` so that doctest examples and fenced code
    carrying a language tag are kept as prose (paragraph tokens) rather than
    executable code, and records hrule/link-definition tokens with their raw
    text so source positions can be recovered during normalization.
    """

    class grammar_class(mistune.BlockGrammar):
        # Pattern matching a ``>>>`` doctest example, reused from the stdlib.
        doctest = __import__('doctest').DocTestParser._EXAMPLE_RE
        # Indented code blocks, excluding lines that start an indented
        # doctest prompt (those are claimed by the ``doctest`` rule).
        block_code = __import__('re').compile(r'^((?!\s+>>>\s) {4}[^\n]+\n*)+')
        default_rules = "newline hrule block_code fences heading nptable lheading block_quote list_block def_links def_footnotes table paragraph text".split()

    def parse_doctest(self, m):
        # Doctests are narrative examples, not code to tangle: keep as prose.
        self.tokens.append({'type': 'paragraph', 'text': m.group(0)})

    def parse_fences(self, m):
        # A fence with an explicit language (group 2) is display-only prose;
        # untagged fences fall through to mistune's normal code handling.
        if m.group(2): self.tokens.append({'type': 'paragraph', 'text': m.group(0)})
        else: super().parse_fences(m)

    def parse_hrule(self, m):
        # Keep the matched text so the rule can be reproduced verbatim later.
        self.tokens.append(dict(type='hrule', text=m.group(0)))

    def parse_def_links(self, m):
        # Record the raw link definition in addition to mistune's bookkeeping.
        super().parse_def_links(m)
        self.tokens.append(dict(type='def_link', text=m.group(0)))

    def parse_front_matter(self): ...

    def parse(self, text: str, default_rules=None, normalize: bool = True) -> typing.List[dict]:
        """Tokenize ``text``; at the top level also split front matter and
        normalize the resulting tokens."""
        front_matter = None
        if not self.depth:
            self.tokens = []
            # A leading ``---`` ... ``---`` section is YAML front matter.
            if text.strip() and text.startswith('---\n') and '\n---\n' in text[4:]:
                front_matter, sep, text = text[4:].partition('---\n')
                front_matter = {'type': 'front_matter', 'text': F"\n{front_matter}"}
        # ``with self`` tracks recursion depth via pidgy.util.ContextDepth,
        # so normalization only runs once at the outermost call.
        with self: tokens = super().parse(pidgy.util.whiten(text), default_rules)
        if normalize and not self.depth: tokens = normalizer(text, tokens)
        if front_matter: tokens.insert(0, front_matter)
        return tokens
Normalizing the tokens¶
Tokenizing [Markdown] typically extracts conventions at both the block and inline level. Fortunately, pidgy
’s translation is restricted to block level [Markdown] tokens, mitigating some potential complexities that would arise from having opinions about inline code while tangling.
normalizer
[5]:
def normalizer(text, tokens):
    """Compact raw lexer tokens into alternating code/paragraph tokens.

    Replays each token's lines against the original ``text`` so the output
    retains the source's inter-block whitespace, merges consecutive
    paragraphs, and finally splits off any leading ``---`` front matter.
    """
    normalized = []
    while tokens:
        current = tokens.pop(0)
        # Skip tokens without usable text (e.g. structural markers).
        if 'text' not in current or not current['text'].strip():
            continue
        pending, chunk = current['text'].splitlines(), ""
        while pending:
            fragment = pending.pop(0)
            if fragment:
                # Consume the original source up to and including this line,
                # keeping the whitespace that preceded it.
                leading, fragment, text = text.partition(fragment)
                chunk += leading + fragment
        if current['type'] == 'code':
            normalized.append({'type': 'code', 'lang': None, 'text': chunk})
        elif normalized and normalized[-1]['type'] == 'paragraph':
            normalized[-1]['text'] += chunk
        else:
            normalized.append({'type': 'paragraph', 'text': chunk})
    # Whatever source remains belongs to the trailing paragraph.
    if normalized and normalized[-1]['type'] == 'paragraph':
        normalized[-1]['text'] += text
    elif text.strip():
        normalized.append({'type': 'paragraph', 'text': text})
    # Deal with front matter: split a leading ``---`` section into its own token.
    if normalized and normalized[0]['text'].startswith('---\n') and '\n---' in normalized[0]['text'][4:]:
        head = normalized.pop(0)
        matter, _, paragraph = head['text'][4:].partition('---')
        normalized = [{'type': 'front_matter', 'text': F"\n{matter}"},
                      {'type': 'paragraph', 'text': paragraph}] + normalized
    return normalized
Flattening the tokens to a [Python] string.¶
The tokenizer controls the translation of Markdown strings to Python strings. Our major constraint is that the Markdown input should retain its line numbers.
Flatten
[6]:
class Tangle(BlockLexer):
    """Translate normalized Markdown tokens into a single Python source string."""

    def stringify(self, tokens: typing.List[dict], source: str = """""", last: int = 0) -> str:
        """Flatten ``tokens`` into Python source.

        Code tokens stay executable (fences stripped); other tokens are
        quoted into string literals and indented so the combined result
        parses while keeping the original line numbers.
        """
        import textwrap
        # Baseline indentation taken from the first code block (default 4).
        INDENT = indent = pidgy.util.base_indent(tokens) or 4
        for i, token in enumerate(tokens):
            object = token['text']
            if token and token['type'] == 'code':
                if object.lstrip().startswith(pidgy.util.FENCE):
                    # Strip the opening and closing fence lines from the block.
                    object = ''.join(''.join(object.partition(pidgy.util.FENCE)[::2]).rpartition(pidgy.util.FENCE)[::2])
                    indent = INDENT + pidgy.util.num_first_indent(object)
                    object = textwrap.indent(object, INDENT*pidgy.util.SPACE)
                # Magic commands leave the running indent untouched; regular
                # code updates it from the block's last line.
                if object.lstrip().startswith(pidgy.util.MAGIC): ...
                else: indent = pidgy.util.num_last_indent(object)
            elif token and token['type'] == 'front_matter':
                # Front matter becomes a locals().update(...) of the parsed YAML.
                object = textwrap.indent(
                    F"locals().update(__import__('ruamel.yaml').yaml.safe_load({pidgy.util.quote(object)}))\n", indent*pidgy.util.SPACE)
            elif not object: ...
            else:
                # Prose: indent to the current code level, then look ahead to
                # the next code block to decide the target indentation.
                object = textwrap.indent(object, pidgy.util.SPACE*max(indent-pidgy.util.num_first_indent(object), 0))
                for next in tokens[i+1:]:
                    if next['type'] == 'code':
                        next = pidgy.util.num_first_indent(next['text'])
                        break
                else: next = indent
                Δ = max(next-indent, 0)
                # A trailing colon opens a new block, so the quoted prose
                # must be pushed one level deeper.
                if not Δ and source.rstrip().rstrip(pidgy.util.CONTINUATION).endswith(pidgy.util.COLON):
                    Δ += 4
                spaces = pidgy.util.indents(object)
                "what if the spaces are ling enough"
                object = object[:spaces] + Δ*pidgy.util.SPACE+ object[spaces:]
                # Only quote when not already inside a quoted continuation.
                if not source.rstrip().rstrip(pidgy.util.CONTINUATION).endswith(pidgy.util.QUOTES):
                    object = pidgy.util.quote(object)
            source += object

        # add a semicolon to the source if the last block is code.
        for token in reversed(tokens):
            if token['text'].strip():
                if token['type'] != 'code':
                    source = source.rstrip() + pidgy.util.SEMI
                break

        return source
Append the lexer for nested rules.
[7]:
# Register the ``doctest`` rule ahead of ``block_code`` in every nested rule
# list, and drop ``block_html`` handling entirely.
for _rules_name in "default_rules footnote_rules list_rules".split():
    _rules = list(getattr(BlockLexer, _rules_name))
    setattr(BlockLexer, _rules_name, _rules)
    _rules.insert(_rules.index('block_code'), 'doctest')
    if 'block_html' in _rules:
        _rules.remove('block_html')
del _rules_name, _rules
More pidgy
language features¶
pidgy
experiments with extra language features for Python, using the same system that IPython uses to add features like line and cell magics.
[1]:
import ast, pidgy, IPython
Recently, IPython introduced a convention that allows top level await statements outside of functions. Building on this convenience, pidgy
allows for top-level return and yield statements. These statements are replaced with an IPython display statement.
[2]:
class ExtraSyntax(ast.NodeTransformer):
    """Rewrite top-level ``return``/``yield`` statements as IPython display calls.

    Function and async-function bodies are left untouched — return/yield are
    legal there — so only statements at the top level of a cell change.
    """

    def visit_FunctionDef(self, node):
        # Do not descend into function bodies.
        return node

    visit_AsyncFunctionDef = visit_FunctionDef

    def visit_Return(self, node):
        """Replace ``return x`` with ``__import__('IPython').display.display(x)``."""
        replace = ast.parse('''__import__('IPython').display.display()''').body[0]
        if node.value is None:
            # A bare ``return`` displays nothing; the original produced the
            # invalid argument list [None], which crashed at compile time.
            replace.value.args = []
        elif isinstance(node.value, ast.Tuple):
            # ``return a, b`` displays each element as a separate argument.
            replace.value.args = node.value.elts
        else:
            replace.value.args = [node.value]
        return ast.copy_location(replace, node)

    def visit_Expr(self, node):
        # ``yield x`` / ``yield from x`` expressions get the same treatment;
        # the Yield node's ``value`` plays the role of the return value.
        if isinstance(node.value, (ast.Yield, ast.YieldFrom)):
            return ast.copy_location(self.visit_Return(node.value), node)
        return node

    visit_Expression = visit_Expr
We know naming is hard, there is no point focusing on it. pidgy
allows authors to use emojis as variables in python. They add extra color and expression to the narrative.
[3]:
def demojize(lines, delimiters=('_', '_')):
    """Replace emoji in Python source with valid identifier names.

    Tokenizes the joined ``lines`` and rewrites every ERRORTOKEN through
    ``emoji.demojize`` into a NAME token (e.g. a salad emoji becomes
    ``_green_salad_``), merging it with a directly preceding NAME token so
    emoji can appear inside identifiers.

    Raises SyntaxError (chained to the underlying failure) when the source
    cannot be tokenized.
    """
    import io, tokenize, emoji  # NOTE: the unused ``stringcase`` import was dropped.
    source = ''.join(lines)  # renamed from ``str`` to avoid shadowing the builtin
    tokens = []
    try:
        # ``tokenize.tokenize`` is already an iterator; no list() needed.
        for token in tokenize.tokenize(io.BytesIO(source.encode()).readline):
            if token.type == tokenize.ERRORTOKEN:
                string = emoji.demojize(token.string, delimiters=delimiters
                                        ).replace('-', '_').replace("’", "_")
                if tokens and tokens[-1].type == tokenize.NAME:
                    # Glue the emoji name onto the preceding identifier.
                    tokens[-1] = tokenize.TokenInfo(
                        tokens[-1].type, tokens[-1].string + string,
                        tokens[-1].start, tokens[-1].end, tokens[-1].line)
                else:
                    tokens.append(tokenize.TokenInfo(
                        tokenize.NAME, string, token.start, token.end, token.line))
            else:
                tokens.append(token)
        return tokenize.untokenize(tokens).decode()
    except BaseException as exc:
        # Surface any tokenization failure as a SyntaxError on the source,
        # keeping the original exception as the cause.
        raise SyntaxError(source) from exc
[ ]:
def init_json():
    """Install JSON/YAML-style literal aliases (yes, true, no, false, null)
    as builtins so they resolve in any namespace."""
    import builtins
    for alias, value in (
        ('yes', True), ('true', True),
        ('no', False), ('false', False),
        ('null', None),
    ):
        setattr(builtins, alias, value)