"""Convert Markdown-style pipe tables in ``test.md`` into HTML tables.

Three table shapes are recognised (see the ``_TABLE_CASE_*`` patterns):

1. ``<table markdown="1">`` + header rows + separator line + body rows
2. header rows + separator line + body rows (no explicit ``<table>`` tag)
3. ``<table markdown="1">`` + body rows (no separator line)

NOTE(review): the copy of this file under review had been whitespace-collapsed
and had every HTML-looking literal stripped out of it.  The tag literals below
(``<tr>``, ``</caption>``, ``<table ...>``, ...) were restored from the
surrounding logic; please compare them against a pristine copy if one exists.
"""

import logging
import re
from itertools import product  # noqa: F401  (unused here; kept deliberately)

logger = logging.getLogger(__name__)


# --------------------------------------------------------------------------
# Column separator line  (e.g. "|:--|--:|:==:|")
# --------------------------------------------------------------------------

# optional leading colon, a run of '-' (td) or '=' (th), optional trailing colon
_COLSPEC_RE = re.compile(r"(:?)(-+|=+)(:?)")

# (has leading colon, has trailing colon) -> alignment keyword
_ALIGNMENTS = {
    (True, False): 'left',
    (False, True): 'right',
    (True, True): 'center',
    (False, False): 'default',
}


def parse_colinfo(linesep):
    """Parse a separator line into per-column alignment and default cell tag.

    One outer ``|`` on each side is dropped, then the line is split on ``|``.
    Each piece made of dashes yields ``'default-tag': 'td'``, each piece made
    of equals signs yields ``'default-tag': 'th'``; colons on the left, right
    or both sides select ``'left'``, ``'right'`` or ``'center'`` alignment.
    Anything unrecognised falls back to ``{'align': 'default',
    'default-tag': 'td'}``.

    Returns:
        A list with one ``{'align': ..., 'default-tag': ...}`` dict per column.
    """
    spec = linesep.strip()
    if spec.startswith("|"):
        spec = spec[1:]
    if spec.endswith("|"):
        spec = spec[:-1]

    column_info = []
    for sep in spec.split("|"):
        m = _COLSPEC_RE.fullmatch(sep.strip())
        if m is None:
            info = {'align': 'default', 'default-tag': 'td'}
        else:
            left, run, right = m.groups()
            info = {
                'align': _ALIGNMENTS[(bool(left), bool(right))],
                'default-tag': 'th' if run[0] == '=' else 'td',
            }
        logger.debug('sep = "%s" -> %s', sep.strip(), info)
        column_info.append(info)
    return column_info


class State:
    """Mutable per-section state shared by the lines of one thead/tbody."""

    def __init__(self, section_tag, default_cell_tag, column_info=None):
        # tag emitted around the rows: "thead", "tbody" or "tfoot"
        self.section_tag = section_tag
        # cell tag used when neither the line nor column_info names one
        self.default_cell_tag = default_cell_tag
        # output of parse_colinfo(); a fresh list per instance (the previous
        # `column_info=[]` default was one list shared by every State)
        self.column_info = [] if column_info is None else column_info
        # set once the opening section tag has been written
        self.already_opened_section = False


# --------------------------------------------------------------------------
# One table line -> HTML
# --------------------------------------------------------------------------

_PART_TAGS = r"(?:colgroup|col|thead|tr|tbody|tfoot)"
_PASSTHROUGH_PATTERN = fr"(?:<{_PART_TAGS}(?:[\s]+[^>]*)?>|</{_PART_TAGS}>)"
_START_TAG_PATTERN = r"<(thead|tbody|tfoot)(?:[\s]+[^>]*)?>"
_OPEN_TR_PATTERN = r"<tr(?:[\s]+[^>]*)?>[\s]*"
_OPEN_CAPTION_PATTERN = r"<caption(?:[\s]+[^>]*)?>"
# Was `<(?:tr|th)...`: `<td>` never ended a caption.  The file-level cell
# delimiter uses td|th, and <tr> is already covered by _OPEN_TR_PATTERN.
_CELL_DELIMITER = r"(?:[|]|<(?:td|th)(?:[\s]+[^<>]*)?>)"
# NOTE(review): the last alternative was stripped from the reviewed copy;
# "</caption>" is assumed.
_CAPTION_SENTINEL_PATTERN = "(?:" + "|".join((
    _CELL_DELIMITER,
    _PASSTHROUGH_PATTERN,
    _START_TAG_PATTERN,
    _OPEN_TR_PATTERN,
    _OPEN_CAPTION_PATTERN,
    r"</caption>",
)) + ")"

_PASSTHROUGH_RE = re.compile(_PASSTHROUGH_PATTERN)
_START_TAG_RE = re.compile(_START_TAG_PATTERN)
_OPEN_TR_RE = re.compile(_OPEN_TR_PATTERN)
_OPEN_CAPTION_RE = re.compile(_OPEN_CAPTION_PATTERN)
_CLOSE_CAPTION_RE = re.compile(r"</caption>")
_CAPTION_SENTINEL_RE = re.compile(_CAPTION_SENTINEL_PATTERN)

_CELL_OPEN_RE = re.compile(r"<(th|td)(?:[\s]+([^<>]*))?>")
_CELL_OPEN_PREFIX_RE = re.compile(r"<(td|th)")
_CELL_CLOSE_RE = re.compile(r"</(td|th)>")
_ATTRIBUTE_RE = re.compile(r'\s*([a-z0-9]+)\s*=\s*"([^"]+)"')
_VBARS_RE = re.compile(r"([|]+)")
_PLAIN_TEXT_RE = re.compile(r"(?:\s|\w|[.,-])+")
# Attribute-tolerant so `<table class="x">` inside a cell is tracked too.
_NESTED_TABLE_OPEN_RE = re.compile(r"<table(?:\s+[^<>]*)?>")
_NESTED_TABLE_CLOSE_RE = re.compile(r"</table>")
_INLINE_OPEN_RE = re.compile(r"<[a-z]+(?:\s+[^<>]*)?>")
_INLINE_CLOSE_RE = re.compile(r"</[a-z]+>")
_CODE_SPAN_RE = re.compile(r"`[^`\n]+`")


def _consume_table_parts(state, line):
    """Copy leading structural tags of *line* to the output.

    Handles section tags, ``<tr>``, ``<caption>`` and the other pass-through
    table parts.  Returns ``(html, remaining_line, already_open_tr)``.
    """
    out = ""
    already_open_tr = False
    while line:
        # An explicit <thead>/<tbody>/<tfoot> switches the current section.
        if (m := _START_TAG_RE.match(line)):
            logger.debug('found section tag: "%s"', m.group(0))
            # NOTE(review): this branch's opening lines were missing from the
            # reviewed copy; closing a still-open section first is inferred
            # from the fragment that survived.
            if state.already_opened_section:
                out += f"</{state.section_tag}>" + "\n"
            out += m.group(0) + "\n"
            line = line[m.end():]
            state.section_tag = m.group(1)
            state.already_opened_section = True
            continue

        # An explicit <tr>: pass it through and remember not to emit our own.
        if (m := _OPEN_TR_RE.match(line)):
            logger.debug("found our tr tag")
            if not state.already_opened_section:
                out += f"<{state.section_tag}>"
                state.already_opened_section = True
            out += m.group(0) + "\n"
            line = line[m.end():]
            already_open_tr = True
            continue

        # A <caption>: copy text until </caption>, any other table part,
        # a cell delimiter, or end of line.
        if (m := _OPEN_CAPTION_RE.match(line)):
            logger.debug('found caption tag: "%s"', m.group(0))
            out += m.group(0)
            line = line[m.end():]
            while line:
                if (closer := _CLOSE_CAPTION_RE.match(line)):
                    line = line[closer.end():]
                    break
                if _CAPTION_SENTINEL_RE.match(line):
                    break
                out += line[0]
                line = line[1:]
            out += "</caption>" + "\n"
            continue

        # Any other table part is copied verbatim.
        if (m := _PASSTHROUGH_RE.match(line)):
            logger.debug('found passthrough tag: "%s"', m.group(0))
            out += m.group(0) + "\n"
            line = line[m.end():]
            continue

        # Anything else must be cell content.
        break
    return out, line, already_open_tr


def _merge_html_attributes(attributes, raw_attributes):
    """Parse ``name="value"`` pairs from a cell tag into *attributes*.

    A ``style`` value is appended to an existing ``style`` (the column
    alignment) instead of replacing it.

    Raises:
        SyntaxError: if the attribute text cannot be parsed.
    """
    # Stripping lets `<td class="x" >` parse instead of failing on the
    # trailing blank.
    remaining = (raw_attributes or "").strip()
    while remaining:
        m = _ATTRIBUTE_RE.match(remaining)
        if m is None:
            logger.debug('could not parse HTML attributes: "%s"', remaining)
            raise SyntaxError("could not parse HTML attributes")
        name, value = m.groups()
        if name == "style" and name in attributes:
            attributes[name] += '; ' + value
        else:
            attributes[name] = value
        remaining = remaining[m.end():].lstrip()


def _open_cell(state, line, column_index):
    """Determine tag and attributes of the next cell and consume its opener.

    Returns ``(tag, attributes, remaining_line)``.

    Raises:
        SyntaxError: for an unterminated or unparsable ``<td``/``<th`` tag.
    """
    attributes = {}
    column = None
    if column_index < len(state.column_info):
        column = state.column_info[column_index]
        if column['align'] != 'default':
            attributes['style'] = f"text-align: {column['align']}"
    fallback_tag = (column['default-tag'] if column is not None
                    else state.default_cell_tag)

    if line.startswith("|"):
        return fallback_tag, attributes, line[1:]

    if (m := _CELL_OPEN_RE.match(line)):
        _merge_html_attributes(attributes, m.group(2))
        return m.group(1), attributes, line[m.end():]

    if _CELL_OPEN_PREFIX_RE.match(line):
        raise SyntaxError("could not find '>' for HTML open tag")

    # No explicit opener: the cell starts right here.
    return fallback_tag, attributes, line


def _read_cell_content(line, attributes):
    """Consume one cell's content; returns ``(content, remaining_line)``.

    The cell ends at a run of ``|`` (a run longer than one sets ``colspan``
    on *attributes*), at ``</td>``/``</th>``, at the next ``<td``/``<th``
    (left unconsumed), or at end of line.  Inside a nested ``<table>`` those
    terminators are ignored.

    Raises:
        SyntaxError: on content that none of the known forms cover.
    """
    content = ""
    depth = 0  # nesting level of <table> inside this cell
    while line:
        if (m := _PLAIN_TEXT_RE.match(line)):
            pass
        elif not depth and _CELL_OPEN_PREFIX_RE.match(line):
            break  # the next cell's opener; the caller handles it
        elif not depth and (m := _CELL_CLOSE_RE.match(line)):
            line = line[m.end():]
            break
        elif not depth and (m := _VBARS_RE.match(line)):
            num_vbars = len(m.group(1))
            if num_vbars > 1:
                attributes['colspan'] = num_vbars
            line = line[m.end():]
            break
        elif (m := _NESTED_TABLE_OPEN_RE.match(line)):
            depth += 1
        elif (m := _NESTED_TABLE_CLOSE_RE.match(line)):
            if depth > 0:
                depth -= 1
        elif (m := _INLINE_OPEN_RE.match(line)):
            pass
        elif (m := _INLINE_CLOSE_RE.match(line)):
            pass
        elif (m := _CODE_SPAN_RE.match(line)):
            pass
        else:
            # Was `assert(not "TODO")`: under -O the assert vanishes and this
            # loop never terminates.  SyntaxError is what the table handlers
            # already catch to leave a table untouched.
            raise SyntaxError(f"unsupported cell content at {line[:20]!r}")
        content += m.group(0)
        line = line[m.end():]
    return content, line


def _format_cell(tag, attributes, content):
    """Render one cell; attributes are emitted sorted by name."""
    if attributes:
        rendered = " ".join(f'{key}="{value}"'
                            for key, value in sorted(attributes.items()))
        return f"<{tag} {rendered}> {content} </{tag}>"
    return f"<{tag}> {content} </{tag}>"


def do_table_line(state, line):
    """Return the HTML for one source line of a table.

    Leading structural tags are passed through; whatever remains is split
    into cells and wrapped in ``<tr>`` (unless the line supplied its own) and,
    on first use, the opening tag of ``state.section_tag``.  *state* is
    updated in place.

    Raises:
        SyntaxError: if a cell tag or cell content cannot be parsed.
    """
    logger.debug('do_table_line: line = "%s"', line)
    out, line, already_open_tr = _consume_table_parts(state, line)

    if line:
        # Anything left after the table parts has to be cells.
        if not state.already_opened_section:
            out += f"<{state.section_tag}>" + "\n"
            state.already_opened_section = True
        if not already_open_tr:
            out += "<tr>" + "\n"

        cells = []
        column_index = 0
        while line:
            tag, attributes, line = _open_cell(state, line, column_index)
            content, line = _read_cell_content(line, attributes)
            cell = _format_cell(tag, attributes, content)
            logger.debug('cell = "%s"', cell)
            cells.append(cell)
            column_index += 1

        # end the row of content, then close tr on its own line
        out += "".join(cells) + "\n"
        out += "</tr>" + "\n"

    logger.debug('out = "%s"', out)
    return out


# --------------------------------------------------------------------------
# re.sub() callbacks
# --------------------------------------------------------------------------

def _render_section(rows, section_tag, default_cell_tag, column_info=None):
    """Render newline-separated *rows* as one section, closing it if opened."""
    state = State(section_tag=section_tag,
                  default_cell_tag=default_cell_tag,
                  column_info=column_info)
    inner = "".join(do_table_line(state, line)
                    for line in rows.strip().split('\n'))
    if state.already_opened_section:
        inner += f"</{state.section_tag}>" "\n"
    return inner


def _build_table(open_tag, header_rows, seperator_line, body_rows, caption):
    """Assemble a whole table; ``None`` parts are skipped.

    Body rows separated by an empty line become separate ``<tbody>`` sections.
    The caption, when present, is placed before everything else.
    """
    inner = ""
    if header_rows is not None:
        inner += _render_section(header_rows, "thead", "th")

    column_info = [] if seperator_line is None else parse_colinfo(seperator_line)

    if body_rows is not None and body_rows.strip():
        for body in body_rows.strip().split('\n\n'):
            inner += _render_section(body, "tbody", "td", column_info)

    if caption:
        inner = f"<caption> {caption} </caption>\n" + inner

    return "\n\n" + open_tag + "\n" + inner + "</table>\n" + "\n"


def _strip_markdown_attribute(table_open_tag):
    """Drop ``markdown="1"`` (and the blank before it) from a table tag."""
    return re.sub(r"\s*markdown=(?:\"1\"|'1'|1)", "", table_open_tag)


def handle_table(m):
    """Unimplemented legacy handler; no ``re.sub`` call in this file uses it.

    Raises:
        NotImplementedError: always (previously ``assert(not "TODO")``, which
            is stripped under ``-O``).
    """
    raise NotImplementedError("handle_table: TODO")


def handle_table_no_sep(m):
    """Unimplemented legacy handler; no ``re.sub`` call in this file uses it.

    Raises:
        NotImplementedError: always (previously ``assert(not "TODO")``, which
            is stripped under ``-O``).
    """
    raise NotImplementedError("handle_table_no_sep: TODO")


def handle_table_case_1(m):
    """``re.sub`` callback: explicit ``<table markdown=...>`` with separator.

    Groups: 1 open tag, 2 header rows, 3 separator line, 4 body rows,
    5 optional caption.  Returns the HTML table, or the matched text unchanged
    when a line raises ``SyntaxError``.
    """
    logger.debug('handle_table_case_1: matched = """%s"""', m.group(0))
    try:
        return _build_table(_strip_markdown_attribute(m.group(1)),
                            m.group(2), m.group(3), m.group(4), m.group(5))
    except SyntaxError as e:
        logger.warning("caught syntax error: %s; moving on to next table...", e)
        return m.group(0)


def handle_table_case_2(m):
    """``re.sub`` callback: bare pipe table (no ``<table>`` tag).

    Groups: 1 header rows, 2 separator line, 3 body rows, 4 optional caption.
    Returns the HTML table, or the matched text unchanged when a line raises
    ``SyntaxError``.
    """
    logger.debug('handle_table_case_2: matched = """%s"""', m.group(0))
    try:
        return _build_table("<table>",
                            m.group(1), m.group(2), m.group(3), m.group(4))
    except SyntaxError as e:
        logger.warning("caught syntax error: %s; moving on to next table...", e)
        return m.group(0)


def handle_table_case_3(m):
    """``re.sub`` callback: explicit ``<table markdown=...>``, no separator.

    Groups: 1 open tag, 2 body rows, 3 optional caption.  Returns the HTML
    table, or the matched text unchanged when a line raises ``SyntaxError``.
    """
    logger.debug('handle_table_case_3: matched = """%s"""', m.group(0))
    try:
        return _build_table(_strip_markdown_attribute(m.group(1)),
                            None, None, m.group(2), m.group(3))
    except SyntaxError as e:
        logger.warning("caught syntax error: %s; moving on to next table...", e)
        return m.group(0)


# --------------------------------------------------------------------------
# Whole-document patterns (all used with re.VERBOSE)
# --------------------------------------------------------------------------

# delimiters between cells
_DELIMITER = r"(?:[|]|<(?:td|th)(?:[\s]+[^<>]*)?>)"
# Most table parts are simple.
_TABLE_TAGS = r"(?:colgroup|col|thead|tr|tbody|tfoot|caption)"
_SIMPLE_TABLE_PART = fr"(?:<{_TABLE_TAGS}(?:\s+[^<>]*)?>|</{_TABLE_TAGS}>)"
# NOTE(review): `[^<>]*` can match the empty string (and newlines) and is
# repeated with `+` in _ROW below -- worth checking for slow backtracking on
# large inputs.  Left exactly as found.
_CAPTION_TABLE_PART = r"(?: [^<>]*)"
_TABLE_PART = fr"(?: {_SIMPLE_TABLE_PART} | {_CAPTION_TABLE_PART})"
# A row is anything with at least one delimiter, or a line of table parts only.
_ROW = fr"(?:(?: .* {_DELIMITER} .*) | {_TABLE_PART}+)"
# Between the header rows and the body rows there is a line separator.
_SEPARATOR_LINE = (r"\s* [|]? \s* [-=:]+ \s* "
                   r"(?: \s* [|] \s* [-=:]* \s* )* \s*")

_TABLE_CASE_1 = fr"""
    # two blank lines:
    [\n]{{2}}
    # required open table tag:
    (?:(<table[^<>]*markdown=(?:"1"|'1'|1)[^<>]*>) \n)
    # header rows:
    ((?: {_ROW} \n)+)
    # required line separator:
    ({_SEPARATOR_LINE} [\n])
    # zero or more body rows, with empty lines of one:
    ((?: {_ROW} [\n]{{1,2}})*)
    # optional caption:
    (?: \[ ([a-z0-9 "']+) \] \n)?
    # optional close table tag:
    (?: </table> [\n])?
    # two blank lines (another newline already matched earlier)
    [\n]{{1}}
"""

_TABLE_CASE_2 = fr"""
    # two blank lines:
    [\n]{{2}}
    # no open table tag
    # one or more header rows:
    ((?: {_ROW} \n)+)
    # line separator:
    ({_SEPARATOR_LINE} [\n])
    # one or more body rows, with empty lines of one:
    ((?: {_ROW} [\n]{{1,2}})+)
    # optional caption:
    (?: \[ ([a-z0-9 "']+) \] \n)?
    # no close table tag
    # two blank lines (another newline already matched earlier)
    [\n]{{1}}
"""

_TABLE_CASE_3 = fr"""
    # two blank lines:
    [\n]{{2}}
    # required open table tag:
    (?:(<table[^<>]*markdown=(?:"1"|'1'|1)[^<>]*>) \n)
    # one or more body rows, with empty lines of one:
    ((?: {_ROW} [\n]{{1,2}})+)
    # no line separator
    # optional caption:
    (?: \[ ([a-z0-9 "']+) \] \n)?
    # optional close table tag:
    (?: </table> [\n])?
    # two blank lines (another newline already matched earlier)
    [\n]{{1}}
"""

# NOTE(review): the text appended after conversion had its content stripped
# from the reviewed copy; only this single blank survived.  Restore the real
# trailer here if there was one.
_TRAILER = """ """


def convert(text):
    """Replace every recognised table in *text* with HTML, then add the trailer.

    The three cases are applied in order 1, 2, 3.
    """
    for pattern, handler in ((_TABLE_CASE_1, handle_table_case_1),
                             (_TABLE_CASE_2, handle_table_case_2),
                             (_TABLE_CASE_3, handle_table_case_3)):
        text = re.sub(pattern, handler, text, flags=re.VERBOSE)
    return text + _TRAILER


def main(source="test.md", target="test.html"):
    """Read *source*, convert its tables and write the result to *target*."""
    # The script used to print a full parse trace; keep it visible when run
    # directly (now on stderr via logging).
    logging.basicConfig(level=logging.DEBUG, format="%(message)s")
    with open(source) as stream:
        text = stream.read()
    with open(target, "w") as stream:
        stream.write(convert(text))


if __name__ == "__main__":
    main()