zanders-php-markdown/test.py
2025-11-23 17:04:30 -06:00

658 lines
15 KiB
Python

import re;
def find_closer(row):
    """Split the text of one table cell off the front of ``row``.

    The cell ends at the first "closer" found, tried in this order:

    1. a run of one or more vertical bars -- the whole run is consumed;
    2. a closing ``</td>``, ``</th>`` or ``</tr>`` tag -- consumed;
    3. an opening ``<td``/``<th`` tag -- left in place, because it
       belongs to the next cell;
    4. the end of the row.

    Parameters:
        row: the remaining, not yet parsed text of a table row.

    Returns:
        A ``(content, rest)`` tuple: the cell text with surrounding
        whitespace stripped, and what is left of ``row`` after the cell.

    Raises:
        SyntaxError: if the cell runs into any other ``<`` before a closer
            is found (inline HTML inside a cell is not supported here).

    NOTE(review): the length of a bar run (a possible colspan) is consumed
    but not reported, and nested ``<table>`` depth is not tracked; both
    were open TODOs in the draft this implementation is based on.
    """
    # 1. One or more vertical bars end the cell and are swallowed.
    m = re.match(r"([^|<]*)[|]+", row)
    if m:
        return m.group(1).strip(), row[m.end():]

    # 2. An explicit HTML close tag ends the cell and is swallowed.
    m = re.match(r"([^|<]*)</(?:td|th|tr)>", row)
    if m:
        return m.group(1).strip(), row[m.end():]

    # 3. An HTML open tag for the next cell ends this one.  Only the
    #    content is consumed (end of group 1), so the tag stays in `rest`.
    #    Requiring whitespace or ">" after the name keeps "<thead" and
    #    friends from being mistaken for "<th".
    m = re.match(r"([^|<]*)<(?:th|td)[\s>]", row)
    if m:
        return m.group(1).strip(), row[m.end(1):]

    # 4. Nothing special left: the cell runs to the end of the row.
    m = re.match(r"([^<|]*)$", row)
    if m:
        return m.group(1).strip(), row[m.end():]

    raise SyntaxError("could not find valid closer for cell")
# def do_row(row, column_info, default_tag = "td"):
# print(f'row = "{row}"');
#
# row = row.lstrip();
#
# out = "";
#
# column_index = 0;
#
# while row:
# print("new cell");
#
# tag = "";
#
# attributes = {};
#
# if column_index < len(column_info):
# align = column_info[column_index]['align'];
#
# if align != 'default':
# attributes['align'] = align;
#
# print(f'attributes = "{attributes}"');
#
# print("looking for starter");
#
# if (m := re.match(r"^[|]", row)):
# print("found opening vbar");
#
# if column_index < len(column_info):
# tag = column_info[column_index]['default-tag']
# else:
# tag = default_tag;
#
# print(f'tag = "{tag}"');
#
# row = row[1:];
#
# print(f'row = "{row}"');
#
# elif (m := re.match(r"^<([a-z]+)(?: +([^<>]*))?>", row)):
# print("found opening HTML tag");
#
# tag = m.group(1);
#
# print(f'tag = "{tag}"');
#
# raw_attributes = m.group(2);
#
# while raw_attributes:
# if (mm := re.match(f'\s*([a-z0-9]+)\s*=\s*"([^"]+)"', raw_attributes)):
# attribute = mm.group(1);
#
# value = mm.group(2);
#
# print(f'attribute = "{attribute}"');
#
# print(f'value = "{value}"');
#
# if attribute in attributes and attribute == "style":
# attributes[attribute] += '; ' + value;
# else:
# attributes[attribute] = value;
#
# print(f'attributes = "{attributes}"');
#
# raw_attributes = raw_attributes[len(mm.group(0)):];
#
# print(f'raw_attributes = "{raw_attributes}"');
# else:
# print(f'could not parse HTML attributes?! huh?! throwing! ');
#
# raise SyntaxError("could not parse HTML attributes");
#
# row = row[len(m.group(0)):];
#
# print(f'row = "{row}"');
# elif (m := re.match(r"^<([a-z]+)", row)):
# print("found HTML open, but it's incomplete? huh?! throwing!");
#
# raise SyntaxError("could not find '>' for HTML open tag");
# elif column_index < len(column_info):
# print("found nothing, defaulting to column info");
#
# tag = column_info[column_index]['default-tag']
#
# print(f'tag = "{tag}"');
# else:
# print(f"found nothing, defaulting default_tag ({default_tag})");
#
# tag = default_tag;
#
# print(f'tag = "{tag}"');
#
# print("looking for closer");
#
# content = "";
#
# depth = 0;
#
# while row:
# if (m := re.match("\s+", row)):
# content += m.group(0);
# row = row[len(m.group(0)):];
# print(f'row = "{row}"');
# elif (m := re.match("\w+", row)):
# content += m.group(0);
# row = row[len(m.group(0)):];
# print(f'row = "{row}"');
# elif (m := re.match("[.,]+", row)):
# content += m.group(0);
# row = row[len(m.group(0)):];
# print(f'row = "{row}"');
# elif not depth and (m := re.match("<(td|tr)", row)):
# print("found HTML open tag: {m.group(1)}");
# break;
# elif not depth and (m := re.match("</(td|tr)>", row)):
# print("found HTML close tag: {m.group(1)}");
# row = row[len(m.group(0)):]
# print(f'row = "{row}"');
# break;
# elif not depth and (m := re.match(r"([|]+)", row)):
# print(f'found vbar ("{m.group(1)}")');
#
# num_vbars = len(m.group(1));
#
# if num_vbars > 1:
# attributes['colspan'] = num_vbars;
#
# row = row[len(m.group(0)):]
# print(f'row = "{row}"');
#
# break;
# elif (m := re.match("<table>", row)):
# content += m.group(0);
# depth += 1;
# row = row[len(m.group(0)):]
# print(f'row = "{row}"');
# print(f'depth = {depth}');
# elif (m := re.match("</table>", row)):
# content += m.group(0);
# depth -= 1;
# row = row[len(m.group(0)):]
# print(f'row = "{row}"');
# print(f'depth = {depth}');
# elif (m := re.match("<[a-z]+(?:\s+[^<>]*)?>", row)):
# content += m.group(0);
# row = row[len(m.group(0)):]
# print(f'row = "{row}"');
# elif (m := re.match("</[a-z]+>", row)):
# content += m.group(0);
# row = row[len(m.group(0)):]
# print(f'row = "{row}"');
# elif (m := re.match(r"`[^`\n]+`", row)):
# assert(not "TODO");
# else:
# print(f'row = "{row}"');
# assert(not "TODO");
#
# if attributes:
# attributes = " ".join(f'{key}="{value}"'
# for key, value in sorted(attributes.items()));
#
# cell = f"<{tag} {attributes}> {content} </{tag}>";
# else:
# cell = f"<{tag}> {content} </{tag}>";
#
# print(f'cell = "{cell}"');
#
# out += cell;
#
# column_index += 1;
#
# print(f'out = "{out}"');
#
# return out;
def parse_colinfo(linesep):
    """Parse a table separator line into per-column settings.

    The line is stripped, one leading and one trailing ``|`` are dropped,
    and the remainder is split on ``|``.  Each piece must be a run of
    ``-`` (default cell tag ``td``) or a run of ``=`` (default cell tag
    ``th``), optionally with a ``:`` on either side to select alignment:

        ``:--``  left      ``--:``  right
        ``:-:``  center    ``---``  default

    Parameters:
        linesep: the separator line, e.g. ``"|:---|:==:|---:|"``.

    Returns:
        A list with one dict per column, each of the form
        ``{'align': ..., 'default-tag': ...}``.

    Raises:
        SyntaxError: if a piece is not a valid column separator.  This is
            the exception type handle_table() catches to skip a table.
    """
    # (has leading colon, has trailing colon) -> alignment name
    alignments = {
        (True, False): 'left',
        (False, True): 'right',
        (True, True): 'center',
        (False, False): 'default',
    }

    linesep = linesep.strip()
    print(f"linesep = {linesep}")

    # The outer bars are optional decoration, not column boundaries.
    if linesep.startswith("|"):
        linesep = linesep[1:]
        print(f"linesep = {linesep}")

    if linesep.endswith("|"):
        linesep = linesep[:-1]
        print(f"linesep = {linesep}")

    column_info = []

    for sep in linesep.split("|"):
        sep = sep.strip()
        print(f'sep = "{sep}"')

        # The rule must be all dashes or all equals signs, never a mix.
        m = re.match(r"^(:?)(-+|=+)(:?)$", sep)

        if not m:
            # Raise explicitly: an assert would be stripped under -O and
            # let the loop carry on with a stale or unbound entry.
            raise SyntaxError(f'invalid column separator: "{sep}"')

        appendme = {
            'align': alignments[(bool(m.group(1)), bool(m.group(3)))],
            'default-tag': 'td' if m.group(2).startswith("-") else 'th',
        }
        print(f'appendme = {appendme}')

        column_info.append(appendme)

    return column_info
def do_table_row():
    """Process one table row (not implemented yet).

    Design notes for the eventual implementation:

    * A row may start with zero or more table parts, with special
      handling for ``<caption>``.
    * The end of the line may be reached once all table parts have been
      consumed.
    * Anything that is not a table part has to be cell content.
    * From there, read through the cells, building them up one by one.

    Raises:
        NotImplementedError: always, until the function is written.
    """
    # Raise explicitly instead of asserting: an assert is stripped under
    # "python -O", which would turn this stub into a silent no-op.
    raise NotImplementedError("do_table_row is not implemented yet")
def do_table(header, linesep, body):
    """Convert one matched table to HTML (not implemented yet).

    Parameters:
        header: text of the header rows, as captured by the table pattern.
        linesep: the separator line between header and body.
        body: text of the body rows.

    Returns:
        Nothing yet.  handle_table() concatenates the result with other
        strings, so the finished function is expected to return a str.

    Raises:
        NotImplementedError: always, until the function is written.

    A commented-out draft implementation follows this function.
    """
    # Raise explicitly instead of asserting: under "python -O" an assert
    # is stripped, the stub would return None, and the caller would fail
    # later with a confusing TypeError while concatenating strings.
    raise NotImplementedError("do_table is not implemented yet")
# passthroughs = [
# r"<colgroup(?:\s+.*)?>",
# r"</colgroup>",
# r"<col(?:\s+.*)?>",
# r"</col>",
# r"<tfoot>",
# r"</tfoot>",
# ];
# output_table = "<table>";
# inside_thead = 0;
# inside_tr = 0;
# print("processing header rows");
# for row in header.split("\n"):
# print(f'row = "{row}"');
# while row:
# if (m := re.match(r'\s+', row)):
# row = row[len(m.group(0)):];
# print(f'row = "{row}"');
# continue;
# for passthrough in passthroughs:
# if row.startswith(passthrough):
# assert(not "TODO");
# continue;
# # explicit caption with optional attributes
# if (m := re.match(r"<caption(?:\s+([^<>\n]*))?>([^\n]+)</caption>", row)):
# print(f'found explicit caption tag: "{m.group(0)}"');
# output_table += m.group(0);
# row = row[len(m.group(0)):];
# print(f'row = "{row}"');
# wrote_thead = 1;
# continue;
# # send thead through, but remember we shouldn't write one ourselves
# if (m := re.match(r'<thead(?:\s+[^<>\n]*)?>', row)):
# print(f'found explicit thead tag: "{m.group(0)}"');
# output_table += m.group(0);
# inside_thead = 1;
# row = row[len(m.group(0)):];
# print(f'row = "{row}"');
# continue;
#
# # does this row have an explicit HTML tag?
# if (m := re.match(r"<tr(?:\s+[^<>\n]*)?>", row)):
# print(f'found explicit tr tag: "{m.group(0)}"');
# output_table += m.group(0);
# inside_tr = 1;
# row = row[len(m.group(0)):];
# print(f'row = "{row}"');
# continue;
# print("thead content");
#
# if not inside_thead:
# output_table += "<thead>";
# wrote_thead = 1;
#
# if not inside_tr:
# output_table += "<tr>";
# # parse row line
# html_row = do_row(row, [], default_tag = "th");
#
# print(f'html_row = "{html_row}"');
# output_table += html_row;
# output_table += "</tr>";
# inside_tr = 0;
# break;
# print(f'output_table = "{output_table}"');
# column_info = parse_colinfo(linesep);
# first_tbody = 1;
# inside_tbody = 0;
# inside_tr = 0;
# for row in body.strip().split('\n'):
# print(f'row = "{row}"');
# if not row.strip():
# print("blank line; new tbody");
# inside_tbody = 0;
# continue;
# while row:
# if (m := re.match(r'\s+', row)):
# row = row[len(m.group(0)):];
# print(f'row = "{row}"');
# continue;
# for passthrough in passthroughs:
# if row.startswith(passthrough):
# assert(not "TODO");
# continue;
# # explicit caption with optional attributes
# if (m := re.match(r"<caption(?:\s+([^<>\n]*))?>([^\n]+)</caption>", row)):
# print(f'found explicit caption tag: "{m.group(0)}"');
# # captions eat content until a table part or delimiter;
# # we don't eat either the table part or the delimiter -- let
# # whatever future logic process that
# assert(not "TODO");
# # output_table += m.group(0);
# # row = row[len(m.group(0)):];
# # print(f'row = "{row}"');
# # wrote_thead = 1;
# # continue;
# # explicit HTML tbody. pass it through and take note
# if (m := re.match(r'<tbody(?:\s+.*)?>', row)):
# print(f'found explicit tbody tag: "{m.group(0)}"');
# output_table += m.group(0);
# inside_tbody = 1;
# row = row[len(m.group(0)):];
# print(f'row = "{row}"');
# wrote_thead = 1;
# continue;
# # does this row have an explicit HTML tag?
# if (m := re.match("<tr(?:\s+.*)?>", row)):
# print(f'found explicit tr tag: "{m.group(0)}"');
# output_table += m.group(0);
# inside_tr = 1;
# row = row[len(m.group(0)):];
# print(f'row = "{row}"');
# continue;
# print("tbody content");
# if not inside_tbody:
# output_table += "<tbody>";
#
# if not inside_tr:
# output_table += "<tr>";
# html_row = do_row(row, column_info, default_tag = "td");
#
# print(f'html_row = "{html_row}"');
# output_table += html_row;
# output_table += "</tr>";
# inside_tr = 0;
# break;
# # if (caption := mm.group(4)):
# # output_table += f'<caption> {caption} </caption>'
# # assert(not "CHECK");
# output_table += "</table>";
# return "\n\n" + output_table + "\n\n";
def handle_table(m):
    """Replacement callback for re.sub(): turn one matched table into HTML.

    Parameters:
        m: a match object produced by the table pattern built at the
           bottom of this file.  Its capture groups are, in order:
           1 optional ``<table>`` tag, 2 header rows, 3 separator line,
           4 body rows, 5 optional caption, 6 optional ``</table>`` tag.

    Returns:
        The HTML for the table, or the matched text unchanged when the
        table cannot be parsed.
    """
    # Group 1 is the optional "<table>" tag, so the three parts that
    # do_table() needs start at group 2.
    header = m.group(2)
    linesep = m.group(3)
    body = m.group(4)

    try:
        return "<table>" + do_table(header, linesep, body) + "</table>"
    except SyntaxError as e:
        # A malformed table is left exactly as written in the source text.
        print(f"caught syntax error: {e}")
        print("moving on to next table...")
        return m.group(0)
# def handle_table2(m):
# table_attributes = m.group(1) or "";
# header = m.group(2);
# linesep = m.group(3);
# body = m.group(4);
#
# try:
# return f"<table {table_attributes}>" + do_table(header, linesep, body) + "</table>";
# except SyntaxError as e:
# print(f"caught syntax error: {e}");
# print("moving on to next table...");
# return m.group(0);
# Driver: read the test document and rewrite every table found in it.
with open("test.txt") as stream:
    text = stream.read()

# All patterns below are fragments for re.VERBOSE, so the spaces inside
# them are layout only and do not match anything.

# delimiters between cells
delimiter = r"(?: [|] | <tr> | <th>)"

# A row is anything with at least one delimiter
row = fr"(?: .* {delimiter} .*)"

# Most table parts are simple.
table_part = "(?: <colgroup> | <col> | <thead> | <tbody> | <tfoot> | </caption>)"

# captions eat until the next table part or delimiter
temp = fr"(?: {table_part} | <caption>)"

# caption_table_part = fr"(?: <caption> (?! (?: {delimiter} | {temp})) .*)"
# caption_table_part = fr"(?: <caption> (?!{delimiter}) .*)"
caption_table_part = fr"(?: <caption> [^<>]* </caption>)"

table_part = fr"(?: {table_part} | {caption_table_part})"

# A table line can also be all table parts
row = fr"(?:{row} | {table_part}+)"

# Between the header rows and the body rows there is a line separator.
seperator_line = r"\s* [|]? \s* [-=:]+ \s* (?: \s* [|] \s* [-=:]* \s* )* \s*"

# Regex for whole table.
#
# This is an f-string, so every literal regex brace must be doubled:
# a bare "{2}" would be evaluated as the expression 2 and the pattern
# would look for a newline followed by the character "2".
#
# The "+" of the header and body groups sits inside the capture so the
# group holds every row, not just the last repetition.
table = fr"""
# two blank lines:
[\n]{{2}}
# optional open table tag:
(?: (<table>)? \n)
# one or more header rows:
((?: {row} \n{{1,2}})+)
# line separator:
({seperator_line}) \n
# one or more body rows:
((?: {row} \n{{1,2}})+)
# optional caption:
(?: \[ ([a-z0-9 "']+) \] \n)?
# optional close table tag:
(?: (</table>)? \n)
# two blank lines:
[\n]{{2}}
"""

print(table)

text = re.sub(table, handle_table, text, flags=re.VERBOSE)
#
#
#
# text = re.sub(r"""
# # blank line before:
# [\n]{1}
#
# # header:
# ( (?: [^\n]+ \n)+ )
#
# # seperator line:
# (\s* [|]? \s* [-=:]+ \s* (?: \s* [|] \s* [-=:]* \s* )* \s*) \n
#
# # body, which is nothing but table parts, or contains at least one
#
# # delimiter ("|", "<td>", "<th>"), or a line
#
# # "<caption>" is special table tag, that may contain anything up to:
# # the end of line
# # other tableparts
# # explicit start of cell
#
# ((?:
# # It contains a delimiter OR it's entirely table parts
# ( .* ("|" | "<tr>" | "<th>" ) .*)
# | ( "<colgroup>"
# | "<col>"
# | "<thead>"
# | "<tbody>"
# | "<tfoot>"
# | "<caption>" (?!^ (("|" | "<tr>" | "<th>") | table part))+ )+
#
# # optionally extra newline after
# \n?
# )+)
#
# # optional caption: (conflicts with tom cells, or even just normal rows)
# # (?: (\[ [^][\n]+ \]) \n )?
#
# # blank line after:
# [\n]{1}
# """, handle_table, text, flags=re.VERBOSE)
#
# print(f'text = """{text}"""');
#
# text += """
# <style>
# table
# {
# border-collapse: collapse;
# margin: 1em 0;
# }
#
# th, td
# {
# border: thin solid grey;
# padding: 0.5em;
# }
# </style>
# """;
#
# with open("test.html", "w") as stream:
# stream.write(text);
#