634 lines
18 KiB
Python
634 lines
18 KiB
Python
|
|
import re;
|
|
|
|
from itertools import product
|
|
|
|
def parse_colinfo(linesep):
    """
    Turn a table separator line into per-column rendering info.

    The line is stripped, one leading and one trailing "|" are dropped, and
    the remainder is split on "|".  Each piece yields a dict with two keys:

    * 'align': 'left', 'right', 'center' or 'default', chosen by where the
      colons sit (":---", "---:", ":---:", "---").
    * 'default-tag': 'td' when the piece is drawn with "-", 'th' when it is
      drawn with "=".

    A piece matching none of the known shapes falls back to
    {'align': 'default', 'default-tag': 'td'}.  Progress is echoed to stdout.

    Returns the list of dicts, one per column, in column order.
    """
    # (pattern, align, default tag); tried top to bottom, first match wins.
    rules = (
        (r"^:-+$", 'left', 'td'),
        (r"^-+:$", 'right', 'td'),
        (r"^:-+:$", 'center', 'td'),
        (r"^-+$", 'default', 'td'),
        (r"^:=+$", 'left', 'th'),
        (r"^=+:$", 'right', 'th'),
        (r"^:=+:$", 'center', 'th'),
        (r"^=+$", 'default', 'th'),
    )
    fallback = ('default', 'td')

    linesep = linesep.strip()
    print(f"linesep = {linesep}")

    # Drop the optional outer vertical bars so splitting yields only columns.
    if linesep.startswith("|"):
        linesep = linesep[1:]
    print(f"linesep = {linesep}")

    if linesep.endswith("|"):
        linesep = linesep[:-1]
    print(f"linesep = {linesep}")

    column_info = []
    for piece in linesep.split("|"):
        sep = piece.strip()
        print(f'sep = "{sep}"')

        align, tag = next(
            ((a, t) for pattern, a, t in rules if re.match(pattern, sep)),
            fallback,
        )

        # A new dict per column, so entries are never shared between columns.
        appendme = {'align': align, 'default-tag': tag}
        print(f'appendme = {appendme}')

        column_info.append(appendme)

    return column_info
|
|
|
|
class State:
    """
    Rendering state for one table section, passed to do_table_line.

    Attributes:
        section_tag: name of the section element currently in effect
            ("thead", "tbody", ...).
        default_cell_tag: cell tag used when neither the line nor
            column_info says otherwise.
        column_info: list of per-column dicts with 'align' and
            'default-tag' keys.
        already_opened_section: 0 until the section's opening tag has been
            emitted, then 1.
    """

    def __init__(self, section_tag, default_cell_tag, column_info=None):
        self.section_tag = section_tag
        self.default_cell_tag = default_cell_tag
        # A literal [] default would be one list shared by every State that
        # omits the argument; build a fresh one per instance instead.
        self.column_info = [] if column_info is None else column_info
        self.already_opened_section = 0
|
|
|
|
def do_table_line(state, line):
    """
    Render one source line of a table as HTML and return it.

    The line is consumed left to right.  Leading table-part tags are handled
    first: a <thead>/<tbody>/<tfoot> start tag is passed through and becomes
    the current section; <tr> and the other structural tags (colgroup, col,
    and the closing tags) are passed through; a <caption> swallows text up
    to </caption>, the next cell delimiter or table tag, or end of line.
    Whatever remains is parsed as cells.  A cell is opened by "|", by an
    explicit <td>/<th> tag (whose attributes are kept), or implicitly, and
    is closed by "|", </td>/</th>, the next <td>/<th>, or end of line.  A
    closing run of N > 1 vertical bars gives the cell colspan="N".  Inside a
    nested <table>...</table> the cell closers are treated as content.

    Parameters:
        state: object exposing section_tag, default_cell_tag, column_info
            and already_opened_section.  section_tag and
            already_opened_section are updated in place when a section is
            opened or switched.
        line: the text of one table line.

    Returns:
        The HTML produced for this line, including the section opening tag
        and <tr>...</tr> when this call had to emit them.

    Raises:
        SyntaxError: on an unterminated <td>/<th> tag, on tag attributes
            that are not of the form name="value", or on cell content this
            parser does not recognise.

    Every step is echoed to stdout for debugging.
    """
    print("do_table_line")

    print(f'line = "{line}"')

    tags = r"(?:colgroup|col|thead|tr|tbody|tfoot)"

    passthrough_pattern = fr"(?:<{tags}(?:[\s]+[^>]*)?>|</{tags}>)"

    start_tag_pattern = r"<(thead|tbody|tfoot)(?:[\s]+[^>]*)?>"

    open_tr_pattern = r"<tr(?:[\s]+[^<>]*)?>[\s]*"

    open_caption_pattern = r"<caption(?:[\s]+[^<>]*)?>"

    # Cells are opened by "|", <td> or <th>.  (This used to list "tr" in
    # place of "td", so a caption swallowed a following <td> cell.)
    cell_delimiter = r"(?:[|]|<(?:td|th)(?:[\s]+[^<>]*)?>)"

    # Anything that must end an unterminated caption.
    caption_sentinel_pattern = (
        "(?:"
        + '|'.join((cell_delimiter, passthrough_pattern, start_tag_pattern,
                    open_tr_pattern, open_caption_pattern, r'</?table.*'))
        + ")"
    )

    already_open_tr = 0

    out = ""

    while line:
        # Is it whatever our start tag is?
        if (m := re.match(start_tag_pattern, line)):
            # Pass it through, remember that the section is now open, and
            # switch "section_tag" to whatever was named.
            print(f"found our start tag: '{m.group(0)}'")

            # if we're already open, close whatever that was
            if state.already_opened_section:
                out += f'</{state.section_tag}>' + "\n"

            out += m.group(0) + "\n"

            line = line[len(m.group(0)):]

            print(f'line = "{line}"')

            state.section_tag = m.group(1)
            state.already_opened_section = 1

            continue

        # Is it a <tr> we should just pass through?
        if (m := re.match(open_tr_pattern, line)):
            # Pass it through and remember that the row is already open.
            print("found our tr tag")

            if not state.already_opened_section:
                out += f"<{state.section_tag}>"
                state.already_opened_section = 1

            out += m.group(0) + "\n"

            line = line[len(m.group(0)):]

            print(f'line = "{line}"')

            already_open_tr = 1

            continue

        # Is it the "<caption>" table part?
        if (m := re.match(open_caption_pattern, line)):
            print(f'found caption tag: "{m.group(0)}"')

            out += m.group(0)

            line = line[len(m.group(0)):]

            print(f'line = "{line}"')

            # Keep eating one character at a time until the explicit close
            # tag, the caption sentinel, or end-of-line.
            while line:
                if (m := re.match(r"</caption>", line)):
                    line = line[len(m.group(0)):]

                    print(f'line = "{line}"')

                    break

                if (m := re.match(caption_sentinel_pattern, line)):
                    break

                out += line[0]

                line = line[1:]

                print(f'line = "{line}"')

            # The close tag is always written here, whether or not the
            # source supplied one.
            out += "</caption>" + "\n"

            continue

        # Is it some table part we should just pass through?
        if (m := re.match(passthrough_pattern, line)):
            print(f'found passthrough tag: "{m.group(0)}"')

            out += m.group(0) + "\n"

            line = line[len(m.group(0)):]

            print(f'line = "{line}"')

            continue

        # If we made it here, it must actually be content
        break

    if line:
        # If there's anything to read after the table parts, it's gotta be cells
        print("new table row")

        column_index = 0

        if not state.already_opened_section:
            out += f"<{state.section_tag}>" + "\n"
            state.already_opened_section = 1

        if not already_open_tr:
            out += "<tr>" + "\n"

        while line:
            print("new cell")

            tag = ""

            print("loading attributes")

            attributes = {}

            # Column alignment from the separator line, if this column has any.
            if column_index < len(state.column_info):
                align = state.column_info[column_index]['align']

                if align != 'default':
                    attributes['style'] = f'text-align: {align}'

            print(f'attributes = "{attributes}"')

            print("looking for starter")

            if (m := re.match(r"[|]", line)):
                print("found opening vbar")

                if column_index < len(state.column_info):
                    tag = state.column_info[column_index]['default-tag']
                else:
                    tag = state.default_cell_tag

                print(f'tag = "{tag}"')

                line = line[1:]

                print(f'line = "{line}"')
            elif (m := re.match(r"<(th|td)(?:[\s]+([^<>]*))?>", line)):
                print("found opening HTML tag")

                tag = m.group(1)

                print(f'tag = "{tag}"')

                raw_attributes = m.group(2)

                while raw_attributes:
                    if (mm := re.match(r'\s*([a-z0-9]+)\s*=\s*"([^"]+)"', raw_attributes)):
                        attribute = mm.group(1)

                        value = mm.group(2)

                        print(f'attribute = "{attribute}"')

                        print(f'value = "{value}"')

                        # An explicit style is appended to the column
                        # alignment; any other attribute simply overwrites.
                        if attribute in attributes and attribute == "style":
                            attributes[attribute] += '; ' + value
                        else:
                            attributes[attribute] = value

                        print(f'attributes = "{attributes}"')

                        raw_attributes = raw_attributes[len(mm.group(0)):]

                        print(f'raw_attributes = "{raw_attributes}"')
                    else:
                        print('could not parse HTML attributes?! huh?! throwing! ')

                        raise SyntaxError("could not parse HTML attributes")

                line = line[len(m.group(0)):]

                print(f'line = "{line}"')
            elif (m := re.match(r"<(th|td)", line)):
                print("found HTML open, but it's incomplete? huh?! throwing!")

                raise SyntaxError("could not find '>' for HTML open tag")
            elif column_index < len(state.column_info):
                print("found nothing, defaulting to column info")

                tag = state.column_info[column_index]['default-tag']

                print(f'tag = "{tag}"')

                print(f'line = "{line}"')
            else:
                print(f"found nothing, defaulting default_tag ({state.default_cell_tag})")

                tag = state.default_cell_tag

                print(f'tag = "{tag}"')

                print(f'line = "{line}"')

            print("looking for closer")

            content = ""

            # Nesting level of <table> elements inside this cell; the cell
            # closers are only honoured at depth 0.
            depth = 0

            while line:
                if (m := re.match(r"(?:\s|\w|[.,-])+", line)):
                    content += m.group(0)
                    line = line[len(m.group(0)):]
                    print(f'line = "{line}"')
                elif not depth and (m := re.match("<(td|th)", line)):
                    # Left in place: the next cell consumes it as its opener.
                    print(f"found HTML open tag: {m.group(1)}")
                    break
                elif not depth and (m := re.match("</(td|th)>", line)):
                    print(f"found HTML close tag: {m.group(1)}")
                    line = line[len(m.group(0)):]
                    print(f'line = "{line}"')
                    break
                elif not depth and (m := re.match(r"([|]+)", line)):
                    print(f'found vbar ("{m.group(1)}")')

                    num_vbars = len(m.group(1))

                    if num_vbars > 1:
                        attributes['colspan'] = num_vbars

                    line = line[len(m.group(0)):]

                    print(f'line = "{line}"')
                    break
                elif (m := re.match("<table>", line)):
                    content += m.group(0)
                    depth += 1
                    line = line[len(m.group(0)):]
                    print(f'line = "{line}"')
                    print(f'depth = {depth}')
                elif (m := re.match("</table>", line)):
                    content += m.group(0)
                    if depth > 0:
                        depth -= 1
                    line = line[len(m.group(0)):]
                    print(f'line = "{line}"')
                    print(f'depth = {depth}')
                elif (m := re.match(r"<[a-z]+(?:\s+[^<>]*)?>", line)):
                    content += m.group(0)
                    line = line[len(m.group(0)):]
                    print(f'line = "{line}"')
                elif (m := re.match("</[a-z]+>", line)):
                    content += m.group(0)
                    line = line[len(m.group(0)):]
                    print(f'line = "{line}"')
                elif (m := re.match(r"`[^`\n]+`", line)):
                    content += m.group(0)
                    line = line[len(m.group(0)):]
                    print(f'line = "{line}"')
                else:
                    print(f'line = "{line}"')

                    # Nothing above consumed any input.  An assert here would
                    # vanish under -O and leave this loop spinning forever,
                    # so fail the same way the other malformed inputs do.
                    raise SyntaxError(f"unsupported cell content at: {line!r}")

            if attributes:
                attributes = " ".join(f'{key}="{value}"'
                    for key, value in sorted(attributes.items()))

                cell = f"<{tag} {attributes}> {content} </{tag}>"
            else:
                cell = f"<{tag}> {content} </{tag}>"

            print(f'cell = "{cell}"')

            out += cell

            column_index += 1

        # end the row of content
        out += "\n"

        # close tr on its own line
        out += "</tr>" + "\n"

    print(f'out = "{out}"')

    return out
|
|
|
|
# def do_table(table_open_tag, header_lines, seperator_line, body_lines, optional_caption):
|
|
#
|
|
# # handle explicit table tag?
|
|
# if table_open_tag:
|
|
# open_tag = table_open_tag + "\n";
|
|
# else:
|
|
# # otherwise, add a default one:
|
|
# open_tag = "<table>" + "\n";
|
|
#
|
|
# inner = "";
|
|
#
|
|
# state = State(section_tag = "thead", default_cell_tag = "th");
|
|
#
|
|
# # Process the header lines:
|
|
# for line in header_lines:
|
|
# inner = do_table_line(state, line);
|
|
#
|
|
# if state.already_opened_section:
|
|
# inner += f"</{state.section_tag}>" "\n";
|
|
#
|
|
# # Handle line seperator:
|
|
# column_info = parse_colinfo(seperator_line);
|
|
#
|
|
# # Process the body lines:
|
|
# for lines in body_lines:
|
|
# state = State(section_tag = "tbody", \
|
|
# default_cell_tag = "td", \
|
|
# column_info = column_info);
|
|
#
|
|
# for line in lines:
|
|
# inner += do_table_line(state, line);
|
|
#
|
|
# if state.already_opened_section:
|
|
# inner += f"</{state.section_tag}>" "\n";
|
|
#
|
|
# # Consider the optional caption.
|
|
# # If it happens, it goes before everything else
|
|
# if optional_caption:
|
|
# inner = f"<caption> {optional_caption} </caption>\n" + inner;
|
|
#
|
|
# close_tag = "</table>\n";
|
|
#
|
|
# for o in inner.split("\n"):
|
|
# print(o);
|
|
#
|
|
# return "\n\n" + open_tag + inner + close_tag + "\n\n";
|
|
|
|
def handle_table(m):
    """
    re.sub callback: render a matched table that has a separator line.

    Parameters:
        m: match object whose groups are read as
            1 - explicit opening <table ...> tag, or None;
            2 - the header lines, newline separated (may be empty);
            3 - the separator line, handed to parse_colinfo;
            4 - the body lines; a blank line starts a new body section;
            5 - caption text, or None.

    Returns:
        The replacement HTML wrapped in blank lines.  If do_table_line
        raises SyntaxError the matched text is returned unchanged, so the
        table is left as it was in the input.
    """
    print("handle_table")

    optional_table_open = m.group(1)

    one_or_more_header_lines = m.group(2)

    header_lines = one_or_more_header_lines.strip().split("\n")

    seperator_line = m.group(3)

    one_or_more_body_lines = m.group(4)

    # One inner list of lines per blank-line-separated body section.
    body_lines = [e.strip().split("\n") for e in one_or_more_body_lines.strip().split("\n\n")]

    optional_caption = m.group(5)

    assert(seperator_line is not None)

    try:
        # handle explicit table tag?
        if optional_table_open:
            open_tag = optional_table_open + "\n"
        else:
            # otherwise, add a default one:
            open_tag = "<table>" + "\n"

        inner = ""

        state = State(section_tag="thead", default_cell_tag="th")

        # Process the header lines.  Accumulate: plain assignment here kept
        # only the last header line and dropped an opening section tag
        # emitted by an earlier one.
        for line in header_lines:
            inner += do_table_line(state, line)

        if state.already_opened_section:
            inner += f"</{state.section_tag}>" "\n"

        # Handle line separator:
        column_info = parse_colinfo(seperator_line)

        # Process the body lines; every group gets its own section state and
        # is closed before the next group starts.
        for lines in body_lines:
            state = State(section_tag="tbody",
                          default_cell_tag="td",
                          column_info=column_info)

            for line in lines:
                inner += do_table_line(state, line)

            if state.already_opened_section:
                inner += f"</{state.section_tag}>" "\n"

        # Consider the optional caption.
        # If it happens, it goes before everything else
        if optional_caption:
            inner = f"<caption> {optional_caption} </caption>\n" + inner

        close_tag = "</table>\n"

        return "\n\n" + open_tag + inner + close_tag + "\n\n"
    except SyntaxError as e:
        print(f"caught syntax error: {e}")
        print("moving on to next table...")
        return m.group(0)
|
|
|
|
def handle_table_no_sep(m):
    """
    re.sub callback: render a matched table that has no separator line.

    Group 1 of the match is the opening <table ...> tag, group 2 the body
    lines (a blank line starts a new body section) and group 3 the caption
    text or None.  Every section is rendered as a "tbody" with "td" default
    cells and no column info.

    Returns the replacement HTML wrapped in blank lines, or the matched
    text unchanged when do_table_line raises SyntaxError.
    """
    print("handle_table_no_sep")

    matched = m.group(0)
    print(f'matched = """{matched}"""')

    table_open_tag = m.group(1) + "\n"

    # One list of lines per blank-line-separated section.
    sections = [
        chunk.strip().split("\n")
        for chunk in m.group(2).strip().split("\n\n")
    ]

    caption_text = m.group(3)

    try:
        pieces = []

        for section_lines in sections:
            # Fresh state per section, closed again once its lines are done.
            state = State(section_tag="tbody",
                          default_cell_tag="td",
                          column_info=[])

            for row_text in section_lines:
                pieces.append(do_table_line(state, row_text))

            if state.already_opened_section:
                pieces.append(f"</{state.section_tag}>" "\n")

        # The caption, when present, is the first thing inside the table.
        if caption_text:
            pieces.insert(0, f"<caption> {caption_text} </caption>\n")

        return "\n\n" + table_open_tag + "".join(pieces) + "</table>\n" + "\n\n"
    except SyntaxError as e:
        print(f"caught syntax error: {e}")
        print("moving on to next table...")
        return m.group(0)
|
|
|
|
# Script body: read test.md, replace every recognised table with HTML,
# append a small stylesheet, and write the result to test.html.
#
# The encoding is given explicitly so the result does not depend on the
# platform's locale default.
with open("test.md", encoding="utf-8") as stream:
    text = stream.read()

# delimiters between cells
# NOTE(review): cells elsewhere in this file are opened by <th>/<td>; the
# "tr" here may be a typo for "td".  Confirm before changing: widening it
# also changes which already-rendered lines can match again in the final
# substitution below.
delimiter = r"(?:[|]|<(?:tr|th)(?:[\s]+[^<>]*)?>)"

# A row is anything with at least one delimiter
row = fr"(?: .* {delimiter} .*)"

# Most table parts are simple.
table_tags = r"(?:colgroup|col|thead|tr|tbody|tfoot|caption)"
table_part = fr"(?:<{table_tags}(?:\s+[^<>]*)?>|</{table_tags}>)"

# captions eat until the next table part or delimiter
# temp = fr"(?: {table_part} | <caption>)";
# caption_table_part = fr"(?: <caption> (?! (?: {delimiter} | {temp})) .*)"
# caption_table_part = fr"(?: <caption> (?!{delimiter}) .*)"
# caption_table_part = fr"(?: <caption> [^<>]* </caption>)"
caption_table_part = r"(?: <caption> [^<>]*)"
table_part = fr"(?: {table_part} | {caption_table_part})"

# A table line can also be all table parts
row = fr"(?:{row} | {table_part}+)"

# Between the header rows and the body rows there is a line separator.
seperator_line = r"\s* [|]? \s* [-=:]+ \s* (?: \s* [|] \s* [-=:]* \s* )* \s*"

# Regex for whole table.  All patterns below are compiled with re.VERBOSE,
# so the spaces and "#" comments inside them are not part of the match.
# o and c select whether the opening / closing <table> tag is required (1)
# or optional (0); a required tag also makes the adjacent rows optional.
for o, c in product((1, 0), repeat=2):
    table = fr"""
    # two blank lines:
    [\n]{{2}}

    # optional or required open table tag:
    (?:(<table[\s]+[^<>]*markdown=(?:"1"|'1'|1)[^<>]*>) \n){{{o},1}}

    # zero or one or more header rows:
    ((?: {row} \n){{{1-o},}})

    # line seperator:
    ({seperator_line}) [\n]

    # zero or one or more body rows, with empty lines of one:
    ((?: {row} [\n]{{1,2}}){{{1-c},}})

    # optional caption:
    (?: \[ ([a-z0-9 "']+) \] \n)?

    # optional or required close table tag:
    (?: </table> [\n]){{{c},1}}

    # two blank lines:
    [\n]{{2}}
    """

    text = re.sub(table, handle_table, text, flags=re.VERBOSE)

# Tables without a separator line must carry explicit <table> tags.
table = fr"""
# two blank lines:
[\n]{{2}}

# required open table tag:
(?:(<table[\s]+[^<>]*markdown=(?:"1"|'1'|1)[^<>]*>) \n)

# one or more body rows, with empty lines of one:
((?: {row} [\n]{{1,2}})+)

# optional caption:
(?: \[ ([a-z0-9 "']+) \] \n)?

# required close table tag:
(?: </table> [\n])

# two blank lines:
[\n]{{2}}
"""

text = re.sub(table, handle_table_no_sep, text, flags=re.VERBOSE)

# Minimal styling so the generated tables show borders.
text += """
<style>
table
{
border-collapse: collapse;
margin: 1em 0;
}

th, td
{
border: thin solid grey;
padding: 0.5em;
}
</style>

"""

with open("test.html", "w", encoding="utf-8") as stream:
    stream.write(text)
|
|
|