import re;
from itertools import product
def parse_colinfo(linesep):
    """Translate a table separator line into per-column metadata.

    The line is stripped, one leading and one trailing ``|`` are dropped,
    and the remainder is split on ``|``.  Every resulting cell describes one
    column: a run of ``-`` selects ``td`` as the column's default cell tag,
    a run of ``=`` selects ``th``; a colon on the left, on the right, or on
    both ends selects ``left``, ``right`` or ``center`` alignment, and no
    colon at all yields ``'default'``.

    Returns a list of ``{'align': ..., 'default-tag': ...}`` dicts, one per
    cell, in order.  Raises ``SyntaxError`` when a cell has none of the
    recognised shapes.  Progress is traced on stdout.
    """
    # (pattern, alignment, default tag) -- tried in order; the patterns are
    # fully anchored, so at most one of them can match a given cell.
    shapes = (
        (r"^:-+$", 'left', 'td'),
        (r"^-+:$", 'right', 'td'),
        (r"^:-+:$", 'center', 'td'),
        (r"^-+$", 'default', 'td'),
        (r"^:=+$", 'left', 'th'),
        (r"^=+:$", 'right', 'th'),
        (r"^:=+:$", 'center', 'th'),
        (r"^=+$", 'default', 'th'),
    )

    linesep = linesep.strip()
    print(f"linesep = {linesep}")
    if linesep.startswith("|"):
        linesep = linesep[1:]
        print(f"linesep = {linesep}")
    if linesep.endswith("|"):
        linesep = linesep[:-1]
        print(f"linesep = {linesep}")

    column_info = []
    for raw_cell in linesep.split("|"):
        sep = raw_cell.strip()
        print(f'sep = "{sep}"')
        for pattern, alignment, cell_tag in shapes:
            if re.match(pattern, sep):
                appendme = {'align': alignment, 'default-tag': cell_tag}
                break
        else:
            # No shape matched this cell.
            raise SyntaxError("bad seperator!")
        print(f'appendme = {appendme}')
        column_info.append(appendme)
    return column_info
class State:
    """Parsing state carried across the lines of one table section.

    Attributes:
        section_tag: name of the section element (``"thead"`` / ``"tbody"``
            at the call sites in this file) that is emitted before the
            section's first row.
        default_cell_tag: cell tag used for columns that have no entry in
            ``column_info``.
        column_info: list of per-column dicts with ``'align'`` and
            ``'default-tag'`` keys, as produced by ``parse_colinfo``.
        already_opened_section: 0 until the section's opening tag has been
            written (or seen in the input), 1 afterwards.
    """

    def __init__(self, section_tag, default_cell_tag, column_info=None):
        self.section_tag = section_tag
        self.default_cell_tag = default_cell_tag
        # Build a fresh list per instance: a literal ``[]`` default is created
        # once and would be shared by every State constructed without
        # column_info.
        self.column_info = [] if column_info is None else column_info
        self.already_opened_section = 0
def do_table_line(state, line):
    """Convert one line of table markup into HTML.

    Leading table-structure tags (the section's own start tag, ``<tr>``,
    ``<caption>`` and the other table parts) are copied to the output.
    Whatever remains on the line is parsed as a sequence of cells that are
    delimited either by ``|`` or by explicit ``<td>``/``<th>`` tags.

    Args:
        state: object with ``section_tag``, ``default_cell_tag``,
            ``column_info`` and ``already_opened_section`` attributes.
            ``already_opened_section`` is set to 1 once the section's opening
            tag has been emitted or passed through.
        line: one line of table source, without its newline.

    Returns:
        The HTML for this line, including the opening ``<section_tag>`` when
        the section had not been opened yet and an opening ``<tr>`` when the
        line contains cells.  No closing ``</tr>`` or section tag is emitted.

    Raises:
        SyntaxError: on a malformed or unterminated HTML open tag, an
            unbalanced nested ``</table>``, or cell content this parser does
            not understand.

    NOTE(review): the literal tag names in the patterns and output strings
    below (``<tr``, ``<caption``, ``</caption>``, ``<table``, ``</table>``
    and the ``</`` of every closing tag) were missing from the source under
    review and have been restored from the variable names and debug
    messages -- confirm against the intended markup.
    """
    print(f'line = "{line}"')

    tags = r"(?:colgroup|col|thead|tr|tbody|tfoot)"
    passthrough_pattern = fr"(?:<{tags}(?:[\s]+[^>]*)?>|</{tags}>)"
    start_tag_pattern = fr"<{state.section_tag}(?:[\s]+[^>]*)?>"
    open_tr_pattern = r"<tr(?:[\s]+[^>]*)?>[\s]*"
    open_caption_pattern = r"<caption(?:[\s]+[^>]*)?>"
    # ``td`` added next to the original ``tr|th``: cells are opened by
    # <td>/<th>, and <tr> is already covered by open_tr_pattern.
    cell_delimiter = r"(?:[|]|<(?:tr|td|th)(?:[\s]+[^<>]*)?>)"
    # Anything that ends an unterminated caption.
    caption_sentinel_pattern = "(?:" + "|".join((
        cell_delimiter,
        passthrough_pattern,
        start_tag_pattern,
        open_tr_pattern,
        open_caption_pattern,
    )) + ")"

    already_open_tr = 0
    out = ""

    # Phase 1: consume leading table-structure tags.
    while line:
        # Is it whatever our start tag is?
        if (m := re.match(start_tag_pattern, line)):
            # Pass it through and remember that we need not emit it ourselves.
            print("found our start tag")
            out += m.group(0)
            line = line[m.end():]
            print(f'line = "{line}"')
            state.already_opened_section = 1
            continue

        # Is it a <tr> we should just pass through?
        if (m := re.match(open_tr_pattern, line)):
            # Also passed through; the row is then already open.
            print("found our tr tag")
            if not state.already_opened_section:
                out += f"<{state.section_tag}>"
                state.already_opened_section = 1
            out += m.group(0)
            line = line[m.end():]
            print(f'line = "{line}"')
            already_open_tr = 1
            continue

        # Is it the "<caption>" table part?
        if (m := re.match(open_caption_pattern, line)):
            print(f'found caption tag: "{m.group(0)}"')
            out += m.group(0)
            line = line[m.end():]
            print(f'line = "{line}"')
            # Keep eating caption text until the explicit closer, a caption
            # sentinel, or end-of-line.
            while line:
                if (m := re.match(r"</caption>", line)):
                    line = line[m.end():]
                    print(f'line = "{line}"')
                    break
                if re.match(caption_sentinel_pattern, line):
                    break
                out += line[0]
                line = line[1:]
                print(f'line = "{line}"')
            # The closer is always written, whether or not the input had one.
            out += "</caption>"
            continue

        # Is it some table part we should just pass through?
        if (m := re.match(passthrough_pattern, line)):
            print(f'found passthrough tag: "{m.group(0)}"')
            out += m.group(0)
            line = line[m.end():]
            print(f'line = "{line}"')
            continue

        # If we made it here, it must actually be content.
        break

    # Phase 2: anything left after the table parts has to be cells.
    if line:
        print("new table row")
        column_index = 0
        if not state.already_opened_section:
            out += f"<{state.section_tag}>"
            state.already_opened_section = 1
        if not already_open_tr:
            out += "<tr>"

        while line:
            print("new cell")
            tag = ""
            span = 1  # number of columns this cell occupies

            print("loading attributes")
            attributes = {}
            if column_index < len(state.column_info):
                align = state.column_info[column_index]['align']
                if align != 'default':
                    attributes['align'] = align
            print(f'attributes = "{attributes}"')

            print("looking for starter")
            if re.match(r"[|]", line):
                print("found opening vbar")
                if column_index < len(state.column_info):
                    tag = state.column_info[column_index]['default-tag']
                else:
                    tag = state.default_cell_tag
                print(f'tag = "{tag}"')
                line = line[1:]
                print(f'line = "{line}"')
            elif (m := re.match(r"<([a-z]+)(?:[\s]+([^<>]*))?>", line)):
                print("found opening HTML tag")
                tag = m.group(1)
                print(f'tag = "{tag}"')
                # Strip so that whitespace before the '>' is not mistaken
                # for an unparsable attribute.
                raw_attributes = (m.group(2) or "").strip()
                while raw_attributes:
                    mm = re.match(r'\s*([a-z0-9]+)\s*=\s*"([^"]+)"',
                                  raw_attributes)
                    if not mm:
                        print('could not parse HTML attributes?! huh?! throwing! ')
                        raise SyntaxError("could not parse HTML attributes")
                    attribute = mm.group(1)
                    value = mm.group(2)
                    print(f'attribute = "{attribute}"')
                    print(f'value = "{value}"')
                    if attribute in attributes and attribute == "style":
                        # Styles accumulate; every other attribute overrides.
                        attributes[attribute] += '; ' + value
                    else:
                        attributes[attribute] = value
                    print(f'attributes = "{attributes}"')
                    raw_attributes = raw_attributes[mm.end():]
                    print(f'raw_attributes = "{raw_attributes}"')
                line = line[m.end():]
                print(f'line = "{line}"')
            elif re.match(r"<([a-z]+)", line):
                print("found HTML open, but it's incomplete? huh?! throwing!")
                raise SyntaxError("could not find '>' for HTML open tag")
            elif column_index < len(state.column_info):
                print("found nothing, defaulting to column info")
                tag = state.column_info[column_index]['default-tag']
                print(f'tag = "{tag}"')
            else:
                print(f"found nothing, defaulting default_tag ({state.default_cell_tag})")
                tag = state.default_cell_tag
                print(f'tag = "{tag}"')

            print("looking for closer")
            content = ""
            depth = 0  # nesting level of <table> elements inside the cell
            while line:
                if (m := re.match("(?:\\s|\\w|[.,-])+", line)):
                    content += m.group(0)
                    line = line[m.end():]
                    print(f'line = "{line}"')
                elif not depth and (m := re.match("<(td|th)", line)):
                    # The next cell's opener ends this cell; leave it in
                    # place for the next iteration.
                    print(f"found HTML open tag: {m.group(1)}")
                    break
                elif not depth and (m := re.match("</(td|th)>", line)):
                    print(f"found HTML close tag: {m.group(1)}")
                    line = line[m.end():]
                    print(f'line = "{line}"')
                    break
                elif not depth and (m := re.match(r"([|]+)", line)):
                    print(f'found vbar ("{m.group(1)}")')
                    num_vbars = len(m.group(1))
                    if num_vbars > 1:
                        attributes['colspan'] = num_vbars
                        span = num_vbars
                    line = line[m.end():]
                    print(f'line = "{line}"')
                    break
                elif (m := re.match("<table(?:\\s+[^<>]*)?>", line)):
                    # Delimiters inside a nested table belong to that table.
                    content += m.group(0)
                    depth += 1
                    line = line[m.end():]
                    print(f'line = "{line}"')
                    print(f'depth = {depth}')
                elif (m := re.match("</table>", line)):
                    content += m.group(0)
                    depth -= 1
                    if depth < 0:
                        raise SyntaxError("negative depth; bad HTML")
                    line = line[m.end():]
                    print(f'line = "{line}"')
                    print(f'depth = {depth}')
                elif (m := re.match("<[a-z]+(?:\\s+[^<>]*)?>", line)):
                    content += m.group(0)
                    line = line[m.end():]
                    print(f'line = "{line}"')
                elif (m := re.match("</[a-z]+>", line)):
                    content += m.group(0)
                    line = line[m.end():]
                    print(f'line = "{line}"')
                elif (m := re.match(r"`[^`\n]+`", line)):
                    content += m.group(0)
                    line = line[m.end():]
                    print(f'line = "{line}"')
                else:
                    # Nothing consumed the input.  Raise instead of assert:
                    # with assertions disabled (-O) this loop would spin
                    # forever on the same character.
                    print(f'line = "{line}"')
                    raise SyntaxError(
                        f"unsupported table cell content at: {line!r}")

            if attributes:
                attribute_text = " ".join(
                    f'{key}="{value}"'
                    for key, value in sorted(attributes.items()))
                cell = f"<{tag} {attribute_text}> {content} </{tag}>"
            else:
                cell = f"<{tag}> {content} </{tag}>"
            print(f'cell = "{cell}"')
            out += cell
            # A cell closed by N vbars spans N columns, so the following
            # cell's alignment/tag come from N columns further on.
            column_index += span

    print(f'out = "{out}"')
    return out
def do_table(table_open_tag, header_lines, seperator_line, body_lines, optional_caption):
    """Assemble the HTML for one complete table.

    Args:
        table_open_tag: explicit opening table tag taken from the source, or
            a falsy value to have a plain ``<table>`` emitted instead.
        header_lines: list of header row lines; rendered with section tag
            ``thead`` and default cell tag ``th``.
        seperator_line: the separator line, parsed with ``parse_colinfo``
            into the column info that is applied to the body rows.
        body_lines: list of line groups; each group is rendered with its own
            fresh ``tbody`` state and default cell tag ``td``.
        optional_caption: caption text, or a falsy value for no caption.

    Returns:
        The table HTML wrapped in two leading and two trailing newlines.
        The HTML is also echoed line by line on stdout.

    Raises:
        SyntaxError: propagated from ``parse_colinfo`` / ``do_table_line``.

    NOTE(review): the ``<table>``, ``<caption>``/``</caption>`` and
    ``</table>`` literals were missing from the source under review and are
    restored here from context -- confirm against the intended markup.
    """
    pieces = []

    # Use the explicit table tag when there is one, otherwise a default.
    if table_open_tag:
        pieces.append(table_open_tag + "\n")
    else:
        pieces.append("<table>" + "\n")

    # Header lines share one state so that <thead> is opened only once.
    # They are rendered before the separator is parsed, so no column info
    # applies to them.
    state = State(section_tag="thead", default_cell_tag="th")
    for line in header_lines:
        pieces.append(do_table_line(state, line) + "\n")

    column_info = parse_colinfo(seperator_line)

    # Every group of body lines gets a fresh state, i.e. its own <tbody>.
    for lines in body_lines:
        state = State(section_tag="tbody",
                      default_cell_tag="td",
                      column_info=column_info)
        for line in lines:
            pieces.append(do_table_line(state, line) + "\n")

    if optional_caption:
        pieces.append(f"<caption> {optional_caption} </caption>\n")

    pieces.append("</table>\n")

    out = "".join(pieces)
    for o in out.split("\n"):
        print(o)
    return "\n\n" + out + "\n\n"
def handle_table(m):
    """Replacement callback: turn one matched table into HTML.

    ``m`` is a match object whose groups are, in order: the optional opening
    table tag, the header lines, the separator line, the body lines and the
    optional caption.  Header text is split into lines; body text is split
    into blank-line-separated groups of lines; everything is handed to
    ``do_table``.

    Returns ``do_table``'s result, or the matched text unchanged when a
    ``SyntaxError`` is raised while building the table.
    """
    print("handle_table")
    print(f'matched = """{m.group(0)}"""')

    (table_open, header_text, seperator_line,
     body_text, caption) = m.group(1, 2, 3, 4, 5)

    assert seperator_line is not None

    try:
        header_lines = header_text.strip().split("\n")
        body_groups = [
            group.strip().split("\n")
            for group in body_text.strip().split("\n\n")
        ]
        return do_table(table_open, header_lines, seperator_line,
                        body_groups, caption)
    except SyntaxError as e:
        print(f"caught syntax error: {e}")
        print("moving on to next table...")
        return m.group(0)
# Driver: read test.md, replace every recognised table with HTML, and write
# the result to test.html.
#
# NOTE(review): the "<table ...>", "</table>" and closing-tag ("</") literals
# in the patterns below were missing from the source under review, which left
# the table regex with unbalanced parentheses (re.error at run time).  They
# are restored here from context -- confirm against the intended syntax.
with open("test.md") as stream:
    text = stream.read()

# Delimiters between cells: a vertical bar or an opening row/cell tag.
# ``td`` is accepted alongside the original ``tr``/``th``.
delimiter = r"(?:[|]|<(?:tr|td|th)(?:[\s]+[^<>]*)?>)"

# A row is anything with at least one delimiter.
row = fr"(?: .* {delimiter} .*)"

# Most table parts are simple opening or closing tags.
table_tags = r"(?:colgroup|col|thead|tr|tbody|tfoot|caption)"
table_part = fr"(?:<{table_tags}(?:\s+[^<>]*)?>|</{table_tags}>)"

# Caption text: any run of characters that contains no tag.
caption_table_part = fr"(?: [^<>]*)"
table_part = fr"(?: {table_part} | {caption_table_part})"

# A table line can also consist of table parts only.
row = fr"(?:{row} | {table_part}+)"

# Between the header rows and the body rows there is a line seperator.
seperator_line = r"\s* [|]? \s* [-=:]+ \s* (?: \s* [|] \s* [-=:]* \s* )* \s*"

# Regex for the whole table.  ``o`` / ``c`` say whether the explicit open /
# close table tag is required (1) or optional (0).  When the open tag is
# required the header rows may be absent, and when the close tag is required
# the body rows may be absent.  The stricter variants are tried first.
#
# Capture groups, as consumed by handle_table:
#   1 open table tag, 2 header rows, 3 separator, 4 body rows, 5 caption.
for o, c in product((1, 0), repeat=2):
    table = fr"""
        # two blank lines:
        [\n]{{2}}
        # optional or required open table tag:
        (?:(<table(?:\s+[^<>]*)?>) \n){{{o},1}}
        # zero or one or more header rows:
        ((?: {row} \n){{{1-o},}})
        # line seperator:
        ({seperator_line}) [\n]
        # zero or one or more body rows, with empty lines of one:
        ((?: {row} [\n]{{1,2}}){{{1-c},}})
        # optional caption:
        (?: \[ ([a-z0-9 "']+) \] \n)?
        # optional or required close table tag:
        (?: </table> [\n]){{{c},1}}
        # two blank lines:
        [\n]{{2}}
    """
    text = re.sub(table, handle_table, text, flags=re.VERBOSE)

# NOTE(review): only a newline is appended here; if trailing markup was
# meant to follow the document, it is absent from this source -- confirm.
text += """
"""

with open("test.html", "w") as stream:
    stream.write(text)