tangling/translating code

tangling code, as presented by donald knuth, converts a document language into a programming language. the original implementation converts ".WEB" files to valid pascal - ".PAS" - files. the pidgy approach begins with [markdown] text that converts to IPython. [Markdown]: # [Python]: #

    import typing, IPython, pidgy.util, ast, textwrap, markdown_it

tangling pidgy uses block level lexical analysis to separate the non-code and code lines in an input; pidgy does not take any opinion on inline level markdown syntax. the PythonRender uses the markdown_it module for parsing markdown; past versions of pidgy have tried #pandoc, mistune, and mistletoe. markdown_it is the preferred parser because it provides line numbers for markdown tokens.

    class Tangle(pidgy.compat.markdown.Markdown):
        """A markdown parser configured for tangling pidgy documents.

        Adds front-matter rules (for both ``+++`` and ``---`` fences) and the
        footnote definition rule, and disables raw html blocks, which have no
        python translation.
        """

        def __init__(self, *args, **kwargs):
            kwargs.setdefault('renderer_cls', PythonRender)
            super().__init__(*args, **kwargs)
            from functools import partial

            # Register a front-matter rule for each fence marker; a plain loop
            # replaces the original side-effect list comprehension.
            for marker in "-+":
                self.block.ruler.before(
                    "code",
                    "front_matter",
                    partial(pidgy.util.frontMatter, marker),
                    {"alt": ["paragraph", "reference", "blockquote", "list"]},
                )
            self.block.ruler.before(
                "reference",
                "footnote_def",
                markdown_it.extensions.footnote.index.footnote_def,
                {"alt": ["paragraph", "reference"]},
            )
            # Raw html blocks carry no meaning in the tangled python source.
            self.disable('html_block')

the primary goal of the pidgy lexical analysis is to separate non-code and code lines when the markdown is pythonified. both indented block code and code fences determine the heuristics for entangling the non-code and code strings. while developing pidgy, we’ve purposefully avoided defining any heuristics for code fenced languages. if authors prefer, they can execute code in pidgy code fences when no language is supplied.

    class Pythonify(pidgy.compat.markdown.Renderer):
        """Render markdown tokens as python: code blocks pass through, prose is quoted.

        State is threaded between tokens through ``env`` — ``base_indent``,
        ``extra_indent``, ``continued``, and ``quoted`` — so each block knows
        how the preceding code block ended.
        """

        # Triple-quote markups a code line may end with; used to detect when
        # the following markdown text is already inside a python string.
        QUOTES = '"""', "'''"
    
        def noncode(self, tokens, idx, env):
            """Return the non-code source between the previous token and ``tokens[idx]``,
            indented and (unless already inside a quoted string) wrapped in quotes
            so it is valid python alongside the code blocks.
            """
            token, range, prior = None, slice(None), slice(*tokens[-1].map)
            if idx < len(tokens):
                token = tokens[idx]
                range, prior = slice(*tokens[idx].map), slice(*tokens[idx-1].map) if idx else slice(0,0)                
            
            # Slice the raw source lines that fall between the prior token and
            # the current one.
            non_code = pidgy.util.dedent_block(''.join(env['src'][prior.stop:range.start]))
            non_code = self.indent(self.hanging_indent(non_code, env), env)
            if not env.get('quoted', False):
                # Only the final chunk of the document gets a trailing ';'.
                non_code = self.quote(non_code, trailing=';' if token is None else '')
            return non_code
        
        def code_block(self, tokens, idx, options, env):
            """Emit an indented code block preceded by its quoted non-code prefix."""
            code = self.noncode(tokens, idx, env) + pidgy.util.quote_docstrings(self.token_to_str(tokens, idx, env))
            # update_env returns None, so `or code` yields the rendered string.
            return self.update_env(code, tokens, idx, env) or code
        
        def fence(self, tokens, idx, options, env):
            "We'll only receive fences without a lang."
            # Strip the fence markers and indent the body so it aligns with
            # indented code blocks.
            code =  self.noncode(tokens, idx, env) + textwrap.indent(
                pidgy.util.quote_docstrings(pidgy.util.unfence(self.token_to_str(tokens, idx, env))), ' '*4
            )
            return self.update_env(code, tokens, idx, env) or code

                
        def update_env(self, code, tokens, idx, env):
            """Record how ``code`` ends in ``env`` for the next token; returns None.

            Sets ``base_indent`` (trailing indent of the emitted code),
            ``extra_indent`` (additional indent the next code token carries, or 4
            after a trailing colon), ``continued`` (line continuation backslash),
            and ``quoted`` (ends inside a triple-quoted string).
            """
            next = self.get_next_code_token(tokens, idx)
            env.update(base_indent=pidgy.util.trailing_indent(code))

            extra_indent = 0
            if next:
                extra_indent = max(0, pidgy.util.lead_indent(env['src'][slice(*next.map)]) -env['base_indent'])
            if not extra_indent and code.rstrip().endswith(":"):
                # A block opener (e.g. `if ...:`) forces one indent level.
                extra_indent += 4
            rstrip = code.rstrip()
            env.update(
                extra_indent=extra_indent,
                continued=rstrip.endswith('\\'), 
                quoted=rstrip.rstrip('\\').endswith(self.QUOTES)
            )

pidgy includes special affordances for common notation like front matter, footnotes as annotations, and bulleted lists.

    class PythonRender(Pythonify):
        """Pythonify plus affordances for front matter, references, and lists."""

        def front_matter(self, tokens, idx, options, env):
            """Translate ``+++`` (toml) or ``---`` (yaml) front matter into a
            ``locals().update(...)`` statement that loads the fenced body."""
            token, code = tokens[idx], self.token_to_str(tokens, idx, env)
            if token.markup == '+++':
                code = F'''locals().update(__import__('toml').loads("""{code}""".partition('+++')[2].rpartition('+++')[0]))\n'''
            elif token.markup == '---':
                code = F'''locals().update(__import__('ruamel.yaml').yaml.safe_load("""{code}""".partition('---')[2].rpartition('---')[0]))\n'''            
            return self.indent(code, env)

            
        def reference(self, tokens, idx, options, env, *, re='link_item'):
            """Convert markdown reference definitions into ``__annotations__`` entries.

            ``re`` names the pidgy.util regex used to match items (links by
            default, footnotes via footnote_reference_open).
            """
            token, code = tokens[idx], self.token_to_str(tokens, idx, env)
            if env['quoted']:
                # Already inside a python string: pass the text through verbatim.
                return code
            
            # Generated code: a dict comprehension over the regex matches.
            expr  = "{"+F"""x.group(1): x.group(2).rstrip() for x in __import__('pidgy').util.{re}.finditer({
                self.quote(textwrap.dedent(code), trailing=")}").rstrip()
            }"""
            if not env['continued']:
                # Merge into __annotations__, keeping existing entries.
                expr = """locals()["__annotations__"] = {**%s, **locals().get('__annotations__', {})}"""%expr
            code = self.noncode(tokens, idx, env) + self.indent(expr + "\n", env)
            return code
        
        def footnote_reference_open(self, tokens, idx, options, env):
            """Footnote definitions become annotations via the footnote regex."""
            return self.reference(tokens, idx, options, env, re='footnote_item')
        
        def bullet_list_open(self, tokens, idx, options, env):
            """Render a markdown list: verbatim when quoted, a python list
            expression after a line continuation, otherwise a quoted string."""
            token, code = tokens[idx], self.token_to_str(tokens, idx, env)
            if env['quoted']:
                return code
            if env['continued']:
                # Generated code: a list comprehension over the list-item matches.
                return self.indent(
                    (F"""[x.group().rstrip().partition(' ')[2] for x in __import__('pidgy').util.list_item.finditer({
                        self.quote(textwrap.dedent(code), trailing=')]')
                    }\n"""), env)
            code = self.quote(textwrap.dedent(code), trailing=';')
            code = self.indent(self.hanging_indent(code, env), env)
            return code

        # Ordered lists share the bulleted-list translation.
        ordered_list_open = bullet_list_open 

tangle is a public function for tangling markdown to python.

    def tangle(str:str)->str:
        """Tangle a markdown string (or iterable of lines) into python source."""
        return Tangle().render(''.join(str or []))

pidgy interfaces with IPython as an input transform manager trait.

    class pidgyManager(pidgy.base.Trait, IPython.core.inputtransformer2.TransformerManager):
        """An IPython input transformer manager that tangles markdown before
        IPython's own input transformations run."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.tangle = Tangle()

        def transform_cell(self, cell):
            """Tangle ``cell`` from markdown to python (when enabled), then
            apply IPython's standard transforms."""
            if self.enabled:
                cell = self.tangle.render(cell)
            # `super(type(self), self)` recurses infinitely when this class is
            # subclassed; the zero-argument form is the correct equivalent.
            return super().transform_cell(cell)

more language features

pidgy experiments extra language features for python, using the same system that IPython uses to add features like line and cell magics.

Recently, IPython introduced a convention that allows top level await statements outside of functions. Building on this convenience, pidgy allows for top-level return and yield statements. These statements are replaced with an IPython display statement.

    class ExtraSyntax(ast.NodeTransformer):
        """Rewrite top-level ``return`` and ``yield`` statements as IPython
        display calls; code inside function definitions is left untouched."""

        def visit_FunctionDef(self, node):
            # Do not recurse: return/yield inside real functions keep their meaning.
            return node

        visit_AsyncFunctionDef = visit_FunctionDef        

        def visit_Return(self, node):
            """Replace ``return x`` with ``__import__('IPython').display.display(x)``.

            A returned tuple is splatted into separate display arguments; a bare
            ``return`` (node.value is None) displays nothing — the original
            passed ``[None]`` as the argument list, producing an invalid AST.
            """
            replace = ast.parse('''__import__('IPython').display.display()''').body[0]
            if node.value is None:
                replace.value.args = []
            elif isinstance(node.value, ast.Tuple):
                replace.value.args = node.value.elts
            else:
                replace.value.args = [node.value]
            return ast.copy_location(replace, node)

        def visit_Expr(self, node):
            # A bare yield / yield from expression statement is displayed too.
            if isinstance(node.value, (ast.Yield, ast.YieldFrom)):
                return ast.copy_location(self.visit_Return(node.value), node)
            return node

        visit_Expression = visit_Expr

We know naming is hard; there is no point focusing on it. pidgy allows authors to use emojis as variables in python. They add extra color and expression to the narrative.

    def demojize(lines, delimiters=('_', '_')):
        """Replace emoji in python source with valid identifier names.

        ``lines`` is an iterable of source lines (or None); the result is the
        untokenized source with each emoji rewritten through ``emoji.demojize``.
        The whole transformation is best effort: any failure — including a
        missing ``emoji`` dependency or untokenizable input — returns the
        source unchanged.
        """
        import io, tokenize
        source = ''.join(lines or [])
        try:
            import emoji  # third-party; importing is part of the best-effort path
            tokens = []
            for token in tokenize.tokenize(io.BytesIO(source.encode()).readline):
                if token.type == tokenize.ERRORTOKEN:
                    # Emoji surface as error tokens; rewrite them as
                    # identifier-safe text.
                    string = emoji.demojize(token.string, delimiters=delimiters
                                            ).replace('-', '_').replace("’", "_")
                    if tokens and tokens[-1].type == tokenize.NAME:
                        # Glue onto the preceding name so `x👍` stays one identifier.
                        tokens[-1] = tokenize.TokenInfo(
                            tokens[-1].type, tokens[-1].string + string,
                            tokens[-1].start, tokens[-1].end, tokens[-1].line)
                    else:
                        tokens.append(tokenize.TokenInfo(
                            tokenize.NAME, string, token.start, token.end, token.line))
                else:
                    tokens.append(token)
            return tokenize.untokenize(tokens).decode()
        except BaseException:
            # Fall back to the untouched source; the original `''.join(lines)`
            # crashed here when lines was None.
            return source