# zanders-php-markdown/test.py


import re;

from itertools import product

def parse_colinfo(linesep):
    """
        Turn a table separator line into a list of per-column settings.

        linesep: the separator line, e.g. "|:---|---:|:===:|".  Surrounding
            whitespace and at most one outer "|" on each side are ignored.

        Returns one dict per "|"-separated piece, with two keys:
            'align':       'left', 'right', 'center' or 'default', taken
                           from the position of the colons.
            'default-tag': 'th' when the piece is drawn with "=", otherwise
                           'td'.
        A piece that is not colons around a run of "-" or "=" yields
        {'align': 'default', 'default-tag': 'td'}.
    """
    linesep = linesep.strip()

    print(f"linesep = {linesep}")

    # Peel off one outer vertical bar from each end, if present.
    if linesep.startswith("|"):
        linesep = linesep[1:]

        print(f"linesep = {linesep}")

    if linesep.endswith("|"):
        linesep = linesep[:-1]

        print(f"linesep = {linesep}")

    # (colon on the left?, colon on the right?) -> alignment keyword
    alignments = {
        (True, False): 'left',
        (False, True): 'right',
        (True, True): 'center',
        (False, False): 'default',
    }

    column_info = []

    for piece in linesep.split("|"):
        sep = piece.strip()

        print(f'sep = "{sep}"')

        # optional colon, a run of one kind of rule character, optional colon
        shape = re.match(r"^(:?)(-+|=+)(:?)$", sep)

        if shape is None:
            appendme = {'align': 'default', 'default-tag': 'td'}
        else:
            colons = (bool(shape.group(1)), bool(shape.group(3)))

            appendme = {
                'align': alignments[colons],
                'default-tag': 'th' if shape.group(2).startswith("=") else 'td',
            }

        print(f'appendme = {appendme}')

        column_info.append(appendme)

    return column_info

class State:
    """
        Mutable bookkeeping shared by the lines of one table section.

        section_tag: name of the section element ("thead", "tbody", ...)
            whose open/close tags are emitted around the rows.
        default_cell_tag: cell tag ("td" or "th") used when a column has no
            entry in column_info.
        column_info: list of per-column dicts with 'align' and 'default-tag'
            keys.  Defaults to a fresh empty list for every instance.
        already_opened_section: 0 until the section's open tag has been
            emitted, then 1.
    """

    def __init__(self, section_tag, default_cell_tag, column_info=None):
        self.section_tag = section_tag

        self.default_cell_tag = default_cell_tag

        # A literal [] default would be created once and shared by every
        # State constructed without column_info; build a new list instead.
        self.column_info = [] if column_info is None else column_info

        self.already_opened_section = 0

def do_table_line(state, line):
    """
        Convert one source line of a table into HTML and return it.

        The line is consumed from left to right in two phases:

        1. Leading table-structure tags.  <thead>/<tbody>/<tfoot> open tags
           are copied through and become the current section; <tr> open
           tags and <caption> blocks are copied through; any other
           colgroup/col/thead/tr/tbody/tfoot open or close tag is copied
           through unchanged.
        2. Whatever is left is read as cells.  A cell starts at a "|", at an
           explicit <td>/<th> open tag, or implicitly.  It ends at a run of
           "|" (two or more bars set colspan), at </td> or </th>, or right
           before the next <td>/<th>.  Cell terminators are ignored while
           inside a nested <table>...</table>.  The section open tag and a
           <tr> are emitted first when they have not been yet, and the row
           is closed with </tr>.

        state: object with section_tag, default_cell_tag, column_info and
            already_opened_section attributes.  section_tag and
            already_opened_section are updated in place.
        line: the text of one table line.

        Returns the generated HTML as a string ("" for an empty line).

        Raises SyntaxError when an explicit <td>/<th> tag is incomplete, its
        attributes cannot be parsed, a </table> has no matching <table>, or
        a cell contains content this parser does not understand.
    """
    print("do_table_line")

    print(f'line = "{line}"')

    tags = r"(?:colgroup|col|thead|tr|tbody|tfoot)"

    passthrough_pattern = fr"(?:<{tags}(?:[\s]+[^>]*)?>|</{tags}>)"

    start_tag_pattern = r"<(thead|tbody|tfoot)(?:[\s]+[^>]*)?>"

    open_tr_pattern = r"<tr(?:[\s]+[^<>]*)?>[\s]*"

    open_caption_pattern = r"<caption(?:[\s]+[^<>]*)?>"

    # "td" was missing from this list, so a <td> following an unclosed
    # caption was swallowed into the caption text.  "tr" is kept so the
    # pattern matches everything it matched before.
    cell_delimiter = r"(?:[|]|<(?:tr|td|th)(?:[\s]+[^<>]*)?>)"

    # Anything that ends an unclosed caption.
    caption_sentinel_pattern = "(?:" + \
        '|'.join((cell_delimiter, passthrough_pattern, start_tag_pattern,
            open_tr_pattern, open_caption_pattern, r'</?table.*')) + ")"

    already_open_tr = 0

    out = ""

    while line:
        # Is it whatever our start tag is?
        if (m := re.match(start_tag_pattern, line)):
            # we'll pass this through, and remember that we don't need to do
            # it ourselves also possibly close previous section also change
            # "section_tag" to be thead
            print(f"found our start tag: '{m.group(0)}'")

            # if we're already open, close whatever that was
            if state.already_opened_section:
                out += f'</{state.section_tag}>' + "\n"

            out += m.group(0) + "\n"

            line = line[len(m.group(0)):]

            print(f'line = "{line}"')

            state.section_tag = m.group(1)
            state.already_opened_section = 1

            continue

        # Is it a <tr> we should just pass through?
        if (m := re.match(open_tr_pattern, line)):

            # we'll also pass this through, and remember that we don't need
            # to do it ourselves.

            print("found our tr tag")

            if not state.already_opened_section:
                out += f"<{state.section_tag}>"
                state.already_opened_section = 1

            out += m.group(0) + "\n"

            line = line[len(m.group(0)):]

            print(f'line = "{line}"')

            already_open_tr = 1

            continue

        # Is it the "<caption>" table part?
        if (m := re.match(open_caption_pattern, line)):

            print(f'found caption tag: "{m.group(0)}"')

            out += m.group(0)

            line = line[len(m.group(0)):]

            print(f'line = "{line}"')

            # For the caption, we'll need to keep eating until we see
            # the caption sentinel, or end-of-line

            while line:
                if (m := re.match(r"</caption>", line)):
                    line = line[len(m.group(0)):]

                    print(f'line = "{line}"')

                    break

                if (m := re.match(caption_sentinel_pattern, line)):
                    break

                out += line[0]

                line = line[1:]

                print(f'line = "{line}"')

            # The close tag is always written here, whether or not the
            # source had one.
            out += "</caption>" + "\n"

            continue

        # Is it some table part we should just pass through?
        if (m := re.match(passthrough_pattern, line)):
            print(f'found passthrough tag: "{m.group(0)}"')

            out += m.group(0) + "\n"

            line = line[len(m.group(0)):]

            print(f'line = "{line}"')

            continue

        # If we made it here, it must actually be content
        break

    if line:
        # If there's anything to read after the table parts, it's gotta be cells

        print("new table row")

        column_index = 0

        if not state.already_opened_section:
            out += f"<{state.section_tag}>" + "\n"
            state.already_opened_section = 1

        if not already_open_tr:
            out += "<tr>" + "\n"

        while line:
            print("new cell")

            tag = ""

            print("loading attributes")

            attributes = {}

            # Seed the attributes with the column's alignment, if it has one.
            if column_index < len(state.column_info):
                align = state.column_info[column_index]['align']

                if align != 'default':
                    attributes['style'] = f'text-align: {align}'

                    print(f'attributes = "{attributes}"')

            print("looking for starter")

            if (m := re.match(r"[|]", line)):
                print("found opening vbar")

                if column_index < len(state.column_info):
                    tag = state.column_info[column_index]['default-tag']
                else:
                    tag = state.default_cell_tag

                print(f'tag = "{tag}"')

                line = line[1:]

                print(f'line = "{line}"')
            elif (m := re.match(r"<(th|td)(?:[\s]+([^<>]*))?>", line)):
                print("found opening HTML tag")

                tag = m.group(1)

                print(f'tag = "{tag}"')

                raw_attributes = m.group(2)

                while raw_attributes:
                    if (mm := re.match(r'\s*([a-z0-9]+)\s*=\s*"([^"]+)"', raw_attributes)):
                        attribute = mm.group(1)

                        value = mm.group(2)

                        print(f'attribute = "{attribute}"')

                        print(f'value = "{value}"')

                        # An explicit style is appended to the alignment
                        # style; any other attribute simply overwrites.
                        if attribute in attributes and attribute == "style":
                            attributes[attribute] += '; ' + value
                        else:
                            attributes[attribute] = value

                        print(f'attributes = "{attributes}"')

                        raw_attributes = raw_attributes[len(mm.group(0)):]

                        print(f'raw_attributes = "{raw_attributes}"')
                    else:
                        print(f'could not parse HTML attributes?! huh?! throwing! ')

                        raise SyntaxError("could not parse HTML attributes")

                line = line[len(m.group(0)):]

                print(f'line = "{line}"')
            elif (m := re.match(r"<(th|td)", line)):
                print("found HTML open, but it's incomplete? huh?! throwing!")

                raise SyntaxError("could not find '>' for HTML open tag")
            elif column_index < len(state.column_info):
                print("found nothing, defaulting to column info")

                tag = state.column_info[column_index]['default-tag']

                print(f'tag = "{tag}"')

                print(f'line = "{line}"')
            else:
                print(f"found nothing, defaulting default_tag ({state.default_cell_tag})")

                tag = state.default_cell_tag

                print(f'tag = "{tag}"')

                print(f'line = "{line}"')

            print("looking for closer")

            content = ""

            # Nesting level of <table> elements inside this cell.
            depth = 0

            while line:
                if (m := re.match("(?:\\s|\\w|[.,-])+", line)):
                    content += m.group(0)
                    line = line[len(m.group(0)):]
                    print(f'line = "{line}"')
                elif not depth and (m := re.match("<(td|th)", line)):
                    # Left in place: it is the next cell's starter.
                    print(f"found HTML open tag: {m.group(1)}")
                    break
                elif not depth and (m := re.match("</(td|th)>", line)):
                    print(f"found HTML close tag: {m.group(1)}")
                    line = line[len(m.group(0)):]
                    print(f'line = "{line}"')
                    break
                elif not depth and (m := re.match(r"([|]+)", line)):
                    print(f'found vbar ("{m.group(1)}")')

                    num_vbars = len(m.group(1))

                    if num_vbars > 1:
                        attributes['colspan'] = num_vbars

                    line = line[len(m.group(0)):]

                    print(f'line = "{line}"')
                    break
                elif (m := re.match("<table>", line)):
                    content += m.group(0)
                    depth += 1
                    line = line[len(m.group(0)):]
                    print(f'line = "{line}"')
                    print(f'depth = {depth}')
                elif (m := re.match("</table>", line)):
                    content += m.group(0)
                    depth -= 1
                    if depth < 0:
                        raise SyntaxError("negative depth; bad HTML")
                    line = line[len(m.group(0)):]
                    print(f'line = "{line}"')
                    print(f'depth = {depth}')
                elif (m := re.match("<[a-z]+(?:\\s+[^<>]*)?>", line)):
                    content += m.group(0)
                    line = line[len(m.group(0)):]
                    print(f'line = "{line}"')
                elif (m := re.match("</[a-z]+>", line)):
                    content += m.group(0)
                    line = line[len(m.group(0)):]
                    print(f'line = "{line}"')
                elif (m := re.match(r"`[^`\n]+`", line)):
                    content += m.group(0)
                    line = line[len(m.group(0)):]
                    print(f'line = "{line}"')
                else:
                    print(f'line = "{line}"')

                    # Content this parser does not handle yet.  This used to
                    # be an assert, which is removed under "python -O"
                    # (leaving this loop spinning forever) and otherwise
                    # surfaced as an AssertionError instead of the
                    # SyntaxError used for every other malformed input.
                    raise SyntaxError(
                        f"unsupported table cell content: {line[:20]!r}")

            if attributes:
                attributes = " ".join(f'{key}="{value}"'
                    for key, value in sorted(attributes.items()))

                cell = f"<{tag} {attributes}> {content} </{tag}>"
            else:
                cell = f"<{tag}> {content} </{tag}>"

            print(f'cell = "{cell}"')

            out += cell

            column_index += 1

        # end the row of content
        out += "\n"

        # close tr on its own line
        out += "</tr>" + "\n"

    print(f'out = "{out}"')

    return out

# def do_table(table_open_tag, header_lines, seperator_line, body_lines, optional_caption):
#
#     # handle explicit table tag?
#     if table_open_tag:
#         open_tag = table_open_tag + "\n";
#     else:
#         # otherwise, add a default one:
#         open_tag = "<table>" + "\n";
#
#     inner = "";
#
#     state = State(section_tag = "thead", default_cell_tag = "th");
#
#     # Process the header lines:
#     for line in header_lines:
#         inner = do_table_line(state, line);
#
#     if state.already_opened_section:
#         inner += f"</{state.section_tag}>" "\n";
#
#     # Handle line seperator:
#     column_info = parse_colinfo(seperator_line);
#
#     # Process the body lines:
#     for lines in body_lines:
#         state = State(section_tag = "tbody", \
#                       default_cell_tag = "td", \
#                       column_info = column_info);
#
#         for line in lines:
#             inner += do_table_line(state, line);
#
#         if state.already_opened_section:
#             inner += f"</{state.section_tag}>" "\n";
#
#     # Consider the optional caption.
#     # If it happens, it goes before everything else
#     if optional_caption:
#         inner = f"<caption> {optional_caption} </caption>\n" + inner;
#
#     close_tag = "</table>\n";
#
#     for o in inner.split("\n"):
#         print(o);
#
#     return "\n\n" + open_tag + inner + close_tag + "\n\n";

def handle_table(m):
    """
        re.sub callback that renders a table with a separator line as HTML.

        m: match object whose groups are
            1: the explicit <table ...> open tag, or None,
            2: the header lines,
            3: the separator line,
            4: the body lines; chunks separated by an empty line are
               rendered as separate sections,
            5: the caption text, or None.

        Returns the HTML for the table surrounded by blank lines.  When a
        line raises SyntaxError the matched text is returned unchanged, so
        the table is left as it was in the source.
    """
    print("handle_table")

    optional_table_open = m.group(1)

    one_or_more_header_lines = m.group(2)

    header_lines = one_or_more_header_lines.strip().split("\n")

    seperator_line = m.group(3)

    one_or_more_body_lines = m.group(4)

    body_lines = [e.strip().split("\n") for e in one_or_more_body_lines.strip().split("\n\n")]

    optional_caption = m.group(5)

    assert seperator_line is not None

    try:
        # handle explicit table tag?
        if optional_table_open:
            open_tag = optional_table_open + "\n"
        else:
            # otherwise, add a default one:
            open_tag = "<table>" + "\n"

        inner = ""

        state = State(section_tag = "thead", default_cell_tag = "th")

        # Process the header lines.  The output is accumulated: plain
        # assignment here used to discard every header line but the last.
        for line in header_lines:
            inner += do_table_line(state, line)

        if state.already_opened_section:
            inner += f"</{state.section_tag}>" "\n"

        # Handle line seperator:
        column_info = parse_colinfo(seperator_line)

        # Process the body lines; every chunk starts with a fresh state and
        # therefore opens its own section.
        for lines in body_lines:
            state = State(section_tag = "tbody",
                          default_cell_tag = "td",
                          column_info = column_info)

            for line in lines:
                inner += do_table_line(state, line)

            if state.already_opened_section:
                inner += f"</{state.section_tag}>" "\n"

        # Consider the optional caption.
        # If it happens, it goes before everything else
        if optional_caption:
            inner = f"<caption> {optional_caption} </caption>\n" + inner

        close_tag = "</table>\n"

        return "\n\n" + open_tag + inner + close_tag + "\n\n"
    except SyntaxError as e:
        print(f"caught syntax error: {e}")
        print("moving on to next table...")
        return m.group(0)

def handle_table_no_sep(m):
    """
        re.sub callback for an explicitly tagged table without a separator
        line.

        m: match object whose groups are
            1: the <table ...> open tag,
            2: the body lines; chunks separated by an empty line are
               rendered as separate sections,
            3: the caption text, or None.

        Returns the HTML for the table surrounded by blank lines, or the
        matched text unchanged when a line raises SyntaxError.
    """
    print("handle_table_no_sep")

    whole_match = m.group(0)

    print(f'matched = """{whole_match}"""')

    opening = m.group(1) + "\n"

    chunks = [
        chunk.strip().split("\n")
        for chunk in m.group(2).strip().split("\n\n")
    ]

    caption_text = m.group(3)

    try:
        pieces = []

        for chunk in chunks:
            # No separator line means no column information; every cell
            # falls back to the default tag.
            state = State(section_tag="tbody",
                          default_cell_tag="td",
                          column_info=[])

            for row_text in chunk:
                pieces.append(do_table_line(state, row_text))

            if state.already_opened_section:
                pieces.append(f"</{state.section_tag}>\n")

        # The caption, when given, precedes all sections.
        if caption_text:
            pieces.insert(0, f"<caption> {caption_text} </caption>\n")

        return "\n\n" + opening + "".join(pieces) + "</table>\n" + "\n\n"
    except SyntaxError as e:
        print(f"caught syntax error: {e}")
        print("moving on to next table...")
        return whole_match

# Read the whole Markdown document at once: the table patterns applied
# below span several lines.
with open("test.md", "r") as stream:
    text = stream.read()

# All fragments below are meant to be compiled with re.VERBOSE, so the
# spaces inside them are layout only.

# Delimiters between cells: a vertical bar or an explicit row/cell open tag.
# "td" was missing from this list, so a row written only with <td> tags was
# not recognised as a row.  "tr" is kept so every line that matched before
# still matches.
delimiter = r"(?:[|]|<(?:tr|td|th)(?:[\s]+[^<>]*)?>)"

# A row is anything with at least one delimiter
row = fr"(?: .* {delimiter} .*)"

# Most table parts are simple.
table_tags = r"(?:colgroup|col|thead|tr|tbody|tfoot|caption)"
table_part = fr"(?:<{table_tags}(?:\s+[^<>]*)?>|</{table_tags}>)"

# captions eat until the next table part or delimiter
# temp = fr"(?: {table_part} | <caption>)";
# caption_table_part = fr"(?: <caption> (?! (?: {delimiter} | {temp})) .*)"
# caption_table_part = fr"(?: <caption> (?!{delimiter}) .*)"
# caption_table_part = fr"(?: <caption> [^<>]* </caption>)"
caption_table_part = fr"(?: <caption> [^<>]*)"
table_part = fr"(?: {table_part} | {caption_table_part})"

# A table line can also be all table parts
row = fr"(?:{row} | {table_part}+)"

# Between the header rows and the body rows there is a line seperator:
# an optional leading bar, a run of "-", "=" and ":" characters, then any
# number of further bar-separated (possibly empty) runs.
seperator_line = r"\s* [|]? \s* [-=:]+ \s* (?: \s* [|] \s* [-=:]* \s* )* \s*"

# Regex for whole table:
#
# The substitution runs once per (o, c) pair, in the order
# (1, 1), (1, 0), (0, 1), (0, 0):
#   o = 1: the <table ... markdown=1 ...> open tag is required ({1,1}) and
#          the header rows may be absent ({0,});
#   o = 0: the open tag is optional ({0,1}) and at least one header row is
#          required ({1,}).
#   c = 1: the </table> close tag is required ({1,1}) and the body rows may
#          be absent ({0,});
#   c = 0: the close tag is optional ({0,1}) and at least one body row is
#          required ({1,}).
# Doubled braces in the f-string become single literal braces in the regex.
# The pattern is compiled with re.VERBOSE, so its layout whitespace and the
# "#" comments inside it are ignored by the regex engine.
# Capture groups, in order: open tag, header rows, separator line, body
# rows, caption text -- the five groups handle_table reads.
for o, c in product((1, 0), repeat=2):
    table = fr"""
        # two blank lines:
        [\n]{{2}}

        # optional or required open table tag:
        (?:(<table[\s]+[^<>]*markdown=(?:"1"|'1'|1)[^<>]*>) \n){{{o},1}}

        # zero or one or more header rows:
        ((?: {row} \n){{{1-o},}})

        # line seperator:
        ({seperator_line}) [\n]

        # zero or one or more body rows, with empty lines of one:
        ((?: {row} [\n]{{1,2}}){{{1-c},}})

        # optional caption:
        (?: \[ ([a-z0-9 "']+) \] \n)?

        # optional or required close table tag:
        (?: </table> [\n]){{{c},1}}

        # two blank lines:
        [\n]{{2}}
    """;

    # Each pass works on the result of the previous one.
    text = re.sub(table, handle_table, text, flags=re.VERBOSE)

# Tables without a separator line.  Both the <table ... markdown=1 ...> open
# tag and the </table> close tag are required here.
# Capture groups, in order: open tag, body rows, caption text -- the three
# groups handle_table_no_sep reads.
# Compiled with re.VERBOSE, so layout whitespace and the "#" comments inside
# the pattern are ignored by the regex engine.
table = fr"""
    # two blank lines:
    [\n]{{2}}

    # required open table tag:
    (?:(<table[\s]+[^<>]*markdown=(?:"1"|'1'|1)[^<>]*>) \n)

    # one or more body rows, with empty lines of one:
    ((?: {row} [\n]{{1,2}})+)

    # optional caption:
    (?: \[ ([a-z0-9 "']+) \] \n)?

    # required close table tag:
    (?: </table> [\n])

    # two blank lines:
    [\n]{{2}}
""";

text = re.sub(table, handle_table_no_sep, text, flags=re.VERBOSE)

# Append a small stylesheet so the generated tables are drawn with borders
# and padding.
text = text + """
<style>
    table
    {
        border-collapse: collapse;
        margin: 1em 0;
    }

    th, td
    {
        border: thin solid grey;
        padding: 0.5em;
    }
</style>

"""

# Write the converted document out.
with open("test.html", mode="w") as stream:
    stream.write(text)