658 lines
15 KiB
Python
658 lines
15 KiB
Python
|
|
import re;
|
|
|
|
def find_closer(row):
    """Split the text of one cell off the front of ``row``.

    Looks at the start of ``row`` for whatever terminates the current cell
    and returns the cell text together with the part of the row that is
    still to be processed.

    Closers are tried in this order:

    * one or more vertical bars -- the bars are consumed;
    * a closing ``</td>``, ``</th>`` or ``</tr>`` tag -- the tag is consumed;
    * an opening ``<th `` / ``<td `` tag (with attributes) -- the tag is
      left at the front of the remainder;
    * the end of the row.

    Cell text itself may contain neither ``|`` nor ``<``.  The number of
    vertical bars is not reported, and nested ``<table>`` content is not
    handled (TODO).

    Args:
        row: the unparsed remainder of a table row.

    Returns:
        A ``(content, row)`` tuple: the stripped cell text and the
        remainder of the row after the cell.

    Raises:
        SyntaxError: if none of the closers above can be found.
    """
    # Closers that are consumed together with the cell text.
    m = re.match(r"([^|<]*)[|]+", row)
    if m is None:
        m = re.match(r"([^|<]*)</(?:td|th|tr)>", row)
    if m is not None:
        return m.group(1).strip(), row[m.end():]

    # An opening tag ends this cell but belongs to the next one, so only
    # the text in front of it is consumed.
    m = re.match(r"([^|<]*)<(?:th|td) ", row)
    if m is not None:
        return m.group(1).strip(), row[m.end(1):]

    # Nothing but plain text left: the end of the row closes the cell.
    m = re.match(r"([^<|]*)$", row)
    if m is not None:
        return m.group(1).strip(), row[m.end():]

    raise SyntaxError("could not find a valid closer for cell")
|
|
|
|
# def do_row(row, column_info, default_tag = "td"):
|
|
# print(f'row = "{row}"');
|
|
#
|
|
# row = row.lstrip();
|
|
#
|
|
# out = "";
|
|
#
|
|
# column_index = 0;
|
|
#
|
|
# while row:
|
|
# print("new cell");
|
|
#
|
|
# tag = "";
|
|
#
|
|
# attributes = {};
|
|
#
|
|
# if column_index < len(column_info):
|
|
# align = column_info[column_index]['align'];
|
|
#
|
|
# if align != 'default':
|
|
# attributes['align'] = align;
|
|
#
|
|
# print(f'attributes = "{attributes}"');
|
|
#
|
|
# print("looking for starter");
|
|
#
|
|
# if (m := re.match(r"^[|]", row)):
|
|
# print("found opening vbar");
|
|
#
|
|
# if column_index < len(column_info):
|
|
# tag = column_info[column_index]['default-tag']
|
|
# else:
|
|
# tag = default_tag;
|
|
#
|
|
# print(f'tag = "{tag}"');
|
|
#
|
|
# row = row[1:];
|
|
#
|
|
# print(f'row = "{row}"');
|
|
#
|
|
# elif (m := re.match(r"^<([a-z]+)(?: +([^<>]*))?>", row)):
|
|
# print("found opening HTML tag");
|
|
#
|
|
# tag = m.group(1);
|
|
#
|
|
# print(f'tag = "{tag}"');
|
|
#
|
|
# raw_attributes = m.group(2);
|
|
#
|
|
# while raw_attributes:
|
|
# if (mm := re.match(f'\s*([a-z0-9]+)\s*=\s*"([^"]+)"', raw_attributes)):
|
|
# attribute = mm.group(1);
|
|
#
|
|
# value = mm.group(2);
|
|
#
|
|
# print(f'attribute = "{attribute}"');
|
|
#
|
|
# print(f'value = "{value}"');
|
|
#
|
|
# if attribute in attributes and attribute == "style":
|
|
# attributes[attribute] += '; ' + value;
|
|
# else:
|
|
# attributes[attribute] = value;
|
|
#
|
|
# print(f'attributes = "{attributes}"');
|
|
#
|
|
# raw_attributes = raw_attributes[len(mm.group(0)):];
|
|
#
|
|
# print(f'raw_attributes = "{raw_attributes}"');
|
|
# else:
|
|
# print(f'could not parse HTML attributes?! huh?! throwing! ');
|
|
#
|
|
# raise SyntaxError("could not parse HTML attributes");
|
|
#
|
|
# row = row[len(m.group(0)):];
|
|
#
|
|
# print(f'row = "{row}"');
|
|
# elif (m := re.match(r"^<([a-z]+)", row)):
|
|
# print("found HTML open, but it's incomplete? huh?! throwing!");
|
|
#
|
|
# raise SyntaxError("could not find '>' for HTML open tag");
|
|
# elif column_index < len(column_info):
|
|
# print("found nothing, defaulting to column info");
|
|
#
|
|
# tag = column_info[column_index]['default-tag']
|
|
#
|
|
# print(f'tag = "{tag}"');
|
|
# else:
|
|
# print(f"found nothing, defaulting default_tag ({default_tag})");
|
|
#
|
|
# tag = default_tag;
|
|
#
|
|
# print(f'tag = "{tag}"');
|
|
#
|
|
# print("looking for closer");
|
|
#
|
|
# content = "";
|
|
#
|
|
# depth = 0;
|
|
#
|
|
# while row:
|
|
# if (m := re.match("\s+", row)):
|
|
# content += m.group(0);
|
|
# row = row[len(m.group(0)):];
|
|
# print(f'row = "{row}"');
|
|
# elif (m := re.match("\w+", row)):
|
|
# content += m.group(0);
|
|
# row = row[len(m.group(0)):];
|
|
# print(f'row = "{row}"');
|
|
# elif (m := re.match("[.,]+", row)):
|
|
# content += m.group(0);
|
|
# row = row[len(m.group(0)):];
|
|
# print(f'row = "{row}"');
|
|
# elif not depth and (m := re.match("<(td|tr)", row)):
|
|
# print("found HTML open tag: {m.group(1)}");
|
|
# break;
|
|
# elif not depth and (m := re.match("</(td|tr)>", row)):
|
|
# print("found HTML close tag: {m.group(1)}");
|
|
# row = row[len(m.group(0)):]
|
|
# print(f'row = "{row}"');
|
|
# break;
|
|
# elif not depth and (m := re.match(r"([|]+)", row)):
|
|
# print(f'found vbar ("{m.group(1)}")');
|
|
#
|
|
# num_vbars = len(m.group(1));
|
|
#
|
|
# if num_vbars > 1:
|
|
# attributes['colspan'] = num_vbars;
|
|
#
|
|
# row = row[len(m.group(0)):]
|
|
# print(f'row = "{row}"');
|
|
#
|
|
# break;
|
|
# elif (m := re.match("<table>", row)):
|
|
# content += m.group(0);
|
|
# depth += 1;
|
|
# row = row[len(m.group(0)):]
|
|
# print(f'row = "{row}"');
|
|
# print(f'depth = {depth}');
|
|
# elif (m := re.match("</table>", row)):
|
|
# content += m.group(0);
|
|
# depth -= 1;
|
|
# row = row[len(m.group(0)):]
|
|
# print(f'row = "{row}"');
|
|
# print(f'depth = {depth}');
|
|
# elif (m := re.match("<[a-z]+(?:\s+[^<>]*)?>", row)):
|
|
# content += m.group(0);
|
|
# row = row[len(m.group(0)):]
|
|
# print(f'row = "{row}"');
|
|
# elif (m := re.match("</[a-z]+>", row)):
|
|
# content += m.group(0);
|
|
# row = row[len(m.group(0)):]
|
|
# print(f'row = "{row}"');
|
|
# elif (m := re.match(r"`[^`\n]+`", row)):
|
|
# assert(not "TODO");
|
|
# else:
|
|
# print(f'row = "{row}"');
|
|
# assert(not "TODO");
|
|
#
|
|
# if attributes:
|
|
# attributes = " ".join(f'{key}="{value}"'
|
|
# for key, value in sorted(attributes.items()));
|
|
#
|
|
# cell = f"<{tag} {attributes}> {content} </{tag}>";
|
|
# else:
|
|
# cell = f"<{tag}> {content} </{tag}>";
|
|
#
|
|
# print(f'cell = "{cell}"');
|
|
#
|
|
# out += cell;
|
|
#
|
|
# column_index += 1;
|
|
#
|
|
# print(f'out = "{out}"');
|
|
#
|
|
# return out;
|
|
|
|
def parse_colinfo(linesep):
    """Parse a header/body separator line into per-column settings.

    The line is stripped, one optional leading and one optional trailing
    ``|`` are removed, and the rest is split on ``|``.  Each piece must be
    a run of ``-`` (default tag ``td``) or a run of ``=`` (default tag
    ``th``), optionally with a ``:`` on the left (left-aligned), on the
    right (right-aligned) or on both sides (centered).

    Args:
        linesep: the separator line, e.g. ``"| :--- | ---: | === |"``.

    Returns:
        A list with one dict per column, each of the form
        ``{'align': 'left'|'right'|'center'|'default',
        'default-tag': 'td'|'th'}``.

    Raises:
        SyntaxError: if a column separator is not of the form above.
    """
    linesep = linesep.strip()
    print(f"linesep = {linesep}")

    # The outer bars are optional; drop them so split() only yields
    # real columns.
    if linesep.startswith("|"):
        linesep = linesep[1:]
        print(f"linesep = {linesep}")

    if linesep.endswith("|"):
        linesep = linesep[:-1]
        print(f"linesep = {linesep}")

    column_info = []

    for sep in linesep.split("|"):
        sep = sep.strip()
        print(f'sep = "{sep}"')

        # Group 1 / 3: optional alignment colons; group 2 is set only for
        # a run of dashes (otherwise the run consisted of equals signs).
        m = re.match(r"^(:?)(?:(-+)|=+)(:?)$", sep)
        if m is None:
            # Raised explicitly (not asserted) so the check survives
            # ``python -O``.
            raise SyntaxError(f'invalid column separator: "{sep}"')

        colon_left = bool(m.group(1))
        colon_right = bool(m.group(3))

        if colon_left and colon_right:
            align = 'center'
        elif colon_left:
            align = 'left'
        elif colon_right:
            align = 'right'
        else:
            align = 'default'

        appendme = {
            'align': align,
            'default-tag': 'td' if m.group(2) else 'th',
        }
        print(f'appendme = {appendme}')

        column_info.append(appendme)

    return column_info
|
|
|
|
def do_table_row():
    """Parse a single table row (not implemented yet).

    Raises:
        NotImplementedError: always; the function is still a stub.
    """
    # Planned approach:
    #
    # - eat zero or more table parts, with special handling for "<caption>"
    # - we may find the end of the line after all table parts have run
    # - when we find something that's not a table part, it has to be content
    # - start reading through the cells, building them up

    # Raised explicitly: an ``assert`` placeholder is stripped by
    # ``python -O`` and the stub would silently return None.
    raise NotImplementedError("do_table_row is not implemented yet")
|
|
|
|
def do_table(header, linesep, body):
    """Build the HTML for one table (not implemented yet).

    Args:
        header: header text of the table, as passed by ``handle_table``.
        linesep: separator line between header and body.
        body: body text of the table.

    Raises:
        NotImplementedError: always; the earlier draft implementation is
            kept below as commented-out code.
    """
    # Raised explicitly: an ``assert`` placeholder is stripped by
    # ``python -O``, after which this would return None and the caller
    # would fail on ``str + None``.
    raise NotImplementedError("do_table is not implemented yet")
|
|
|
|
# passthroughs = [
|
|
# r"<colgroup(?:\s+.*)?>",
|
|
# r"</colgroup>",
|
|
# r"<col(?:\s+.*)?>",
|
|
# r"</col>",
|
|
# r"<tfoot>",
|
|
# r"</tfoot>",
|
|
# ];
|
|
|
|
# output_table = "<table>";
|
|
|
|
# inside_thead = 0;
|
|
# inside_tr = 0;
|
|
|
|
# print("processing header rows");
|
|
|
|
# for row in header.split("\n"):
|
|
# print(f'row = "{row}"');
|
|
|
|
# while row:
|
|
# if (m := re.match(r'\s+', row)):
|
|
# row = row[len(m.group(0)):];
|
|
|
|
# print(f'row = "{row}"');
|
|
|
|
# continue;
|
|
|
|
# for passthrough in passthroughs:
|
|
# if row.startswith(passthrough):
|
|
# assert(not "TODO");
|
|
|
|
# continue;
|
|
|
|
# # explicit caption with optional attributes
|
|
# if (m := re.match(r"<caption(?:\s+([^<>\n]*))?>([^\n]+)</caption>", row)):
|
|
# print(f'found explicit caption tag: "{m.group(0)}"');
|
|
|
|
# output_table += m.group(0);
|
|
|
|
# row = row[len(m.group(0)):];
|
|
|
|
# print(f'row = "{row}"');
|
|
|
|
# wrote_thead = 1;
|
|
|
|
# continue;
|
|
|
|
# # send thead through, but remember we shouldn't do one outselves
|
|
# if (m := re.match(r'<thead(?:\s+[^<>\n]*)?>', row)):
|
|
# print(f'found explicit thead tag: "{m.group(0)}"');
|
|
|
|
# output_table += m.group(0);
|
|
|
|
# inside_thead = 1;
|
|
|
|
# row = row[len(m.group(0)):];
|
|
|
|
# print(f'row = "{row}"');
|
|
|
|
# continue;
|
|
#
|
|
# # does this row have an explicit HTML tag?
|
|
# if (m := re.match(r"<tr(?:\s+[^<>\n]*)?>", row)):
|
|
# print(f'found explicit tr tag: "{m.group(0)}"');
|
|
|
|
# output_table += m.group(0);
|
|
|
|
# inside_tr = 1;
|
|
|
|
# row = row[len(m.group(0)):];
|
|
|
|
# print(f'row = "{row}"');
|
|
|
|
# continue;
|
|
|
|
# print("thead content");
|
|
#
|
|
# if not inside_thead:
|
|
# output_table += "<thead>";
|
|
|
|
# wrote_thead = 1;
|
|
#
|
|
# if not inside_tr:
|
|
# output_table += "<tr>";
|
|
|
|
# # parse row line
|
|
# html_row = do_row(row, [], default_tag = "th");
|
|
#
|
|
# print(f'html_row = "{html_row}"');
|
|
|
|
# output_table += html_row;
|
|
|
|
# output_table += "</tr>";
|
|
|
|
# inside_tr = 0;
|
|
|
|
# break;
|
|
|
|
# print(f'output_table = "{output_table}"');
|
|
|
|
# column_info = parse_colinfo(linesep);
|
|
|
|
# first_tbody = 1;
|
|
|
|
# inside_tbody = 0;
|
|
# inside_tr = 0;
|
|
|
|
# for row in body.strip().split('\n'):
|
|
# print(f'row = "{row}"');
|
|
|
|
# if not row.strip():
|
|
# print("blank line; new tbody");
|
|
|
|
# inside_tbody = 0;
|
|
|
|
# continue;
|
|
|
|
# while row:
|
|
# if (m := re.match(r'\s+', row)):
|
|
# row = row[len(m.group(0)):];
|
|
|
|
# print(f'row = "{row}"');
|
|
|
|
# continue;
|
|
|
|
# for passthrough in passthroughs:
|
|
# if row.startswith(passthrough):
|
|
# assert(not "TODO");
|
|
|
|
# continue;
|
|
|
|
# # explicit caption with optional attributes
|
|
# if (m := re.match(r"<caption(?:\s+([^<>\n]*))?>([^\n]+)</caption>", row)):
|
|
# print(f'found explicit caption tag: "{m.group(0)}"');
|
|
|
|
# # caption eat content until tablepart or delimitor
|
|
|
|
# # we don't eat either the tablepart of the delimitor, let
|
|
# # whatever future logic process that
|
|
|
|
# assert(not "TODO");
|
|
# # output_table += m.group(0);
|
|
|
|
# # row = row[len(m.group(0)):];
|
|
|
|
# # print(f'row = "{row}"');
|
|
|
|
# # wrote_thead = 1;
|
|
|
|
# # continue;
|
|
|
|
# # explicit HTML tbody. pass it through and take note
|
|
# if (m := re.match(r'<tbody(?:\s+.*)?>', row)):
|
|
# print(f'found explicit tbody tag: "{m.group(0)}"');
|
|
|
|
# output_table += m.group(0);
|
|
|
|
# inside_tbody = 1;
|
|
|
|
# row = row[len(m.group(0)):];
|
|
|
|
# print(f'row = "{row}"');
|
|
|
|
# wrote_thead = 1;
|
|
|
|
# continue;
|
|
|
|
# # does this row have an explicit HTML tag?
|
|
# if (m := re.match("<tr(?:\s+.*)?>", row)):
|
|
# print(f'found explicit tr tag: "{m.group(0)}"');
|
|
|
|
# output_table += m.group(0);
|
|
|
|
# inside_tr = 1;
|
|
|
|
# row = row[len(m.group(0)):];
|
|
|
|
# print(f'row = "{row}"');
|
|
|
|
# continue;
|
|
|
|
# print("tbody content");
|
|
|
|
# if not inside_tbody:
|
|
# output_table += "<tbody>";
|
|
#
|
|
# if not inside_tr:
|
|
# output_table += "<tr>";
|
|
|
|
# html_row = do_row(row, column_info, default_tag = "td");
|
|
#
|
|
# print(f'html_row = "{html_row}"');
|
|
|
|
# output_table += html_row;
|
|
|
|
# output_table += "</tr>";
|
|
|
|
# inside_tr = 0;
|
|
|
|
# break;
|
|
|
|
# # if (caption := mm.group(4)):
|
|
# # output_table += f'<caption> {caption} </caption>'
|
|
|
|
# # assert(not "CHECK");
|
|
|
|
# output_table += "</table>";
|
|
|
|
# return "\n\n" + output_table + "\n\n";
|
|
|
|
def handle_table(m):
    """``re.sub`` callback that replaces one matched table with HTML.

    Args:
        m: match object produced by the ``table`` pattern.  Group 1 is the
            optional literal ``<table>`` tag, group 2 the header rows,
            group 3 the separator line and group 4 the body rows.

    Returns:
        The HTML produced by ``do_table`` wrapped in ``<table>`` tags, or
        the matched text unchanged if ``do_table`` raises ``SyntaxError``.
    """
    # Group 1 is the optional "<table>" tag, so the table parts start at
    # group 2.
    header = m.group(2)
    linesep = m.group(3)
    body = m.group(4)

    try:
        return "<table>" + do_table(header, linesep, body) + "</table>"
    except SyntaxError as e:
        # Best effort: leave a table that cannot be parsed as it was.
        print(f"caught syntax error: {e}")
        print("moving on to next table...")
        return m.group(0)
|
|
|
|
# def handle_table2(m):
|
|
# table_attributes = m.group(1) or "";
|
|
# header = m.group(2);
|
|
# linesep = m.group(3);
|
|
# body = m.group(4);
|
|
#
|
|
# try:
|
|
# return f"<table {table_attributes}>" + do_table(header, linesep, body) + "</table>";
|
|
# except SyntaxError as e:
|
|
# print(f"caught syntax error: {e}");
|
|
# print("moving on to next table...");
|
|
# return m.group(0);
|
|
|
|
# Read the whole input document.
with open("test.txt") as stream:
    text = stream.read()

# All patterns below are written for re.VERBOSE: literal whitespace inside
# them is ignored.

# Delimiters between cells.
delimiter = r"(?: [|] | <tr> | <th>)"

# A row is anything with at least one delimiter.
row = fr"(?: .* {delimiter} .*)"

# Most table parts are simple.
table_part = "(?: <colgroup> | <col> | <thead> | <tbody> | <tfoot> | </caption>)"

# Captions eat until the next table part or delimiter.
# (``temp`` is only referenced by the earlier attempts kept below.)
temp = fr"(?: {table_part} | <caption>)"
# caption_table_part = fr"(?: <caption> (?! (?: {delimiter} | {temp})) .*)"
# caption_table_part = fr"(?: <caption> (?!{delimiter}) .*)"
caption_table_part = r"(?: <caption> [^<>]* </caption>)"
table_part = fr"(?: {table_part} | {caption_table_part})"

# A table line can also be all table parts.
row = fr"(?:{row} | {table_part}+)"

# Between the header rows and the body rows there is a line separator.
seperator_line = r"\s* [|]? \s* [-=:]+ \s* (?: \s* [|] \s* [-=:]* \s* )* \s*"

# Regex for the whole table.  This is an f-string, so every literal regex
# quantifier brace must be doubled ("{{2}}"); a bare "{2}" would be
# formatted as the text "2".
table = fr"""
    # two blank lines:
    [\n]{{2}}

    # optional open table tag (group 1):
    (?: (<table>)? \n)

    # one or more header rows (group 2); the repetition sits inside the
    # group so that every row is captured, not just the last one:
    ((?: {row} \n{{1,2}} )+)

    # line separator (group 3):
    ({seperator_line}) \n

    # one or more body rows (group 4):
    ((?: {row} \n{{1,2}} )+)

    # optional caption (group 5):
    (?: \[ ([a-z0-9 "']+) \] \n)?

    # optional close table tag (group 6):
    (?: (</table>)? \n)

    # two blank lines:
    [\n]{{2}}
"""

print(table)

text = re.sub(table, handle_table, text, flags=re.VERBOSE)
|
|
|
|
#
|
|
#
|
|
#
|
|
# text = re.sub(r"""
|
|
# # blank line before:
|
|
# [\n]{1}
|
|
#
|
|
# # header:
|
|
# ( (?: [^\n]+ \n)+ )
|
|
#
|
|
# # seperator line:
|
|
# (\s* [|]? \s* [-=:]+ \s* (?: \s* [|] \s* [-=:]* \s* )* \s*) \n
|
|
#
|
|
# # body, which is nothing but table parts, or contains at least one
|
|
#
|
|
# # delimiter ("|", "<td>", "<th>"), or a line
|
|
#
|
|
# # "<caption>" is special table tag, that may contain anything up to:
|
|
# # the end of line
|
|
# # other tableparts
|
|
# # explicit start of cell
|
|
#
|
|
# ((?:
|
|
# # It contains a deliminator OR it's enitrely table parts
|
|
# ( .* ("|" | "<tr>" | "<th>" ) .*)
|
|
# | ( "<colgroup>"
|
|
# | "<col>"
|
|
# | "<thead>"
|
|
# | "<tbody>"
|
|
# | "<tfoot>"
|
|
# | "<caption>" (?!^ (("|" | "<tr>" | "<th>") | table part))+ )+
|
|
#
|
|
# # optionally extra newline after
|
|
# \n?
|
|
# )+)
|
|
#
|
|
# # optional caption: (conflicts with tom cells, or even just normal rows)
|
|
# # (?: (\[ [^][\n]+ \]) \n )?
|
|
#
|
|
# # blank line after:
|
|
# [\n]{1}
|
|
# """, handle_table, text, flags=re.VERBOSE)
|
|
#
|
|
# print(f'text = """{text}"""');
|
|
#
|
|
# text += """
|
|
# <style>
|
|
# table
|
|
# {
|
|
# border-collapse: collapse;
|
|
# margin: 1em 0;
|
|
# }
|
|
#
|
|
# th, td
|
|
# {
|
|
# border: thin solid grey;
|
|
# padding: 0.5em;
|
|
# }
|
|
# </style>
|
|
# """;
|
|
#
|
|
# with open("test.html", "w") as stream:
|
|
# stream.write(text);
|
|
#
|