zanders-php-markdown/test.py


import re;

def find_closer(row):
	"""Work-in-progress helper; presumably meant to find where a table cell ends.

	The commented-out block below is an earlier regex-based draft (closing on
	a run of vbars, a closing/opening HTML cell tag, or end of line) that is
	kept for reference only; none of it executes.

	NOTE(review): the live statements read ``depth`` and ``content``, but
	neither name is assigned anywhere in this function. Unless they exist as
	module-level globals, calling this function raises NameError. Confirm the
	intended source of both values before using it.

	Returns:
		The tuple ``(content, row)``.
	"""

		# if (m := re.match(r"([^|<]*)([|]+)", row)):
		# 	print(f'found vbar ("{m.group(2)}")');

		# 	content = m.group(1).strip();

		# 	print(f'content = "{content}"');

		# 	num_vbars = len(m.group(2));

		# 	if num_vbars > 1:
		# 		attributes['colspan'] = num_vbars;

		# 	row = row[len(m.group(0)):];

		# 	print(f'row = "{row}"');
		# elif (m := re.match(r"([^|<]*)</(td|th|tr)>", row)):
		# 	print(f"found close HTML tag");

		# 	content = m.group(1).strip();

		# 	print(f'content = "{content}"');

		# 	row = row[len(m.group(0)):];

		# 	print(f'row = "{row}"');
		# elif (m := re.match(r"([^|<]*)<(th|td) ", row)):
		# 	print(f"found open HTML tag ({m.group(2)})");

		# 	content = m.group(1).strip();

		# 	print(f'content = "{content}"');

		# 	row = row[len(m.group(1)):];

		# 	print(f'row = "{row}"');
		# elif (m := re.match(r"([^<|]*)$", row)):
		# 	print(f"found eol");

		# 	content = m.group(1).strip();

		# 	print(f'content = "{content}"');

		# 	row = row[len(m.group(0)):];

		# 	print(f'row = "{row}"');
		# else:
		# 	print(f"could not find valid closer? huh?!? throwing!");

		# 	raise SyntaxError("could not valid closer for cell");

	# Nested content (non-zero depth) is not handled yet.
	if depth:
		assert(not "TODO");

	return content, row;

# def do_row(row, column_info, default_tag = "td"):
# 	print(f'row = "{row}"');
#
# 	row = row.lstrip();
#
# 	out = "";
#
# 	column_index = 0;
#
# 	while row:
# 		print("new cell");
#
# 		tag = "";
#
# 		attributes = {};
#
# 		if column_index < len(column_info):
# 			align = column_info[column_index]['align'];
#
# 			if align != 'default':
# 				attributes['align'] = align;
#
# 				print(f'attributes = "{attributes}"');
#
# 		print("looking for starter");
#
# 		if (m := re.match(r"^[|]", row)):
# 			print("found opening vbar");
#
# 			if column_index < len(column_info):
# 				tag = column_info[column_index]['default-tag']
# 			else:
# 				tag = default_tag;
#
# 			print(f'tag = "{tag}"');
#
# 			row = row[1:];
#
# 			print(f'row = "{row}"');
#
# 		elif (m := re.match(r"^<([a-z]+)(?: +([^<>]*))?>", row)):
# 			print("found opening HTML tag");
#
# 			tag = m.group(1);
#
# 			print(f'tag = "{tag}"');
#
# 			raw_attributes = m.group(2);
#
# 			while raw_attributes:
# 				if (mm := re.match(f'\s*([a-z0-9]+)\s*=\s*"([^"]+)"', raw_attributes)):
# 					attribute = mm.group(1);
#
# 					value = mm.group(2);
#
# 					print(f'attribute = "{attribute}"');
#
# 					print(f'value = "{value}"');
#
# 					if attribute in attributes and attribute == "style":
# 						attributes[attribute] += '; ' + value;
# 					else:
# 						attributes[attribute] = value;
#
# 					print(f'attributes = "{attributes}"');
#
# 					raw_attributes = raw_attributes[len(mm.group(0)):];
#
# 					print(f'raw_attributes = "{raw_attributes}"');
# 				else:
# 					print(f'could not parse HTML attributes?! huh?! throwing! ');
#
# 					raise SyntaxError("could not parse HTML attributes");
#
# 			row = row[len(m.group(0)):];
#
# 			print(f'row = "{row}"');
# 		elif (m := re.match(r"^<([a-z]+)", row)):
# 			print("found HTML open, but it's incomplete? huh?! throwing!");
#
# 			raise SyntaxError("could not find '>' for HTML open tag");
# 		elif column_index < len(column_info):
# 			print("found nothing, defaulting to column info");
#
# 			tag = column_info[column_index]['default-tag']
#
# 			print(f'tag = "{tag}"');
# 		else:
# 			print(f"found nothing, defaulting default_tag ({default_tag})");
#
# 			tag = default_tag;
#
# 			print(f'tag = "{tag}"');
#
# 		print("looking for closer");
#
# 		content = "";
#
# 		depth = 0;
#
# 		while row:
# 			if (m := re.match("\s+", row)):
# 				content += m.group(0);
# 				row = row[len(m.group(0)):];
# 				print(f'row = "{row}"');
# 			elif (m := re.match("\w+", row)):
# 				content += m.group(0);
# 				row = row[len(m.group(0)):];
# 				print(f'row = "{row}"');
# 			elif (m := re.match("[.,]+", row)):
# 				content += m.group(0);
# 				row = row[len(m.group(0)):];
# 				print(f'row = "{row}"');
# 			elif not depth and (m := re.match("<(td|tr)", row)):
# 				print("found HTML open tag: {m.group(1)}");
# 				break;
# 			elif not depth and (m := re.match("</(td|tr)>", row)):
# 				print("found HTML close tag: {m.group(1)}");
# 				row = row[len(m.group(0)):]
# 				print(f'row = "{row}"');
# 				break;
# 			elif not depth and (m := re.match(r"([|]+)", row)):
# 				print(f'found vbar ("{m.group(1)}")');
#
# 				num_vbars = len(m.group(1));
#
# 				if num_vbars > 1:
# 					attributes['colspan'] = num_vbars;
#
# 				row = row[len(m.group(0)):]
# 				print(f'row = "{row}"');
#
# 				break;
# 			elif (m := re.match("<table>", row)):
# 				content += m.group(0);
# 				depth += 1;
# 				row = row[len(m.group(0)):]
# 				print(f'row = "{row}"');
# 				print(f'depth = {depth}');
# 			elif (m := re.match("</table>", row)):
# 				content += m.group(0);
# 				depth -= 1;
# 				row = row[len(m.group(0)):]
# 				print(f'row = "{row}"');
# 				print(f'depth = {depth}');
# 			elif (m := re.match("<[a-z]+(?:\s+[^<>]*)?>", row)):
# 				content += m.group(0);
# 				row = row[len(m.group(0)):]
# 				print(f'row = "{row}"');
# 			elif (m := re.match("</[a-z]+>", row)):
# 				content += m.group(0);
# 				row = row[len(m.group(0)):]
# 				print(f'row = "{row}"');
# 			elif (m := re.match(r"`[^`\n]+`", row)):
# 				assert(not "TODO");
# 			else:
# 				print(f'row = "{row}"');
# 				assert(not "TODO");
#
# 		if attributes:
# 			attributes = " ".join(f'{key}="{value}"'
# 				for key, value in sorted(attributes.items()));
#
# 			cell = f"<{tag} {attributes}> {content} </{tag}>";
# 		else:
# 			cell = f"<{tag}> {content} </{tag}>";
#
# 		print(f'cell = "{cell}"');
#
# 		out += cell;
#
# 		column_index += 1;
#
# 	print(f'out = "{out}"');
#
# 	return out;

def parse_colinfo(linesep):
	"""Parse a table separator line into per-column alignment/tag info.

	The line is stripped, one leading and one trailing "|" are removed if
	present, and the remainder is split on "|". Each cell must be a run of
	"-" (default tag "td") or a run of "=" (default tag "th"), optionally
	preceded and/or followed by a single ":":

		":---"  -> left      "---:"  -> right
		":---:" -> center    "---"   -> default

	Debug information is printed to stdout while parsing.

	Args:
		linesep: The separator line, e.g. "| :-- | :=: | --: |".

	Returns:
		A list with one dict per column, each of the form
		{'align': <'left'|'right'|'center'|'default'>,
		 'default-tag': <'td'|'th'>}.

	Raises:
		SyntaxError: If a cell is not a valid column separator.
	"""
	linesep = linesep.strip()

	print(f"linesep = {linesep}")

	if linesep.startswith("|"):
		linesep = linesep[1:]

		print(f"linesep = {linesep}")

	if linesep.endswith("|"):
		linesep = linesep[:-1]

		print(f"linesep = {linesep}")

	column_info = []

	for sep in linesep.split("|"):
		sep = sep.strip()

		print(f'sep = "{sep}"')

		# Optional colon, a homogeneous run of "-" or "=", optional colon.
		m = re.fullmatch(r"(:?)(-+|=+)(:?)", sep)

		if m is None:
			# Raise instead of assert: assertions vanish under "python -O",
			# which would leave appendme unbound or stale from the previous
			# column. SyntaxError matches what handle_table() catches.
			raise SyntaxError(f'could not parse column separator "{sep}"')

		left_colon, rule, right_colon = m.groups()

		if left_colon and right_colon:
			align = 'center'
		elif left_colon:
			align = 'left'
		elif right_colon:
			align = 'right'
		else:
			align = 'default'

		appendme = {
			'align': align,
			'default-tag': 'td' if rule[0] == '-' else 'th',
		}

		print(f'appendme = {appendme}')

		column_info.append(appendme)

	return column_info

def do_table_row():
    """Process a single table row (not implemented yet).

    Design notes for the eventual implementation:

    * Zero or more table parts may be consumed first, with special handling
      for "<caption>".
    * The end of the line may be reached once all table parts have been
      consumed.
    * Anything that is not a table part has to be cell content.
    * The cells are then read through one by one and built up.

    Raises:
        NotImplementedError: Always, until the row parser is written.
    """
    # Raise explicitly rather than assert: assertions are stripped under
    # "python -O", which would let this stub silently return None.
    raise NotImplementedError("do_table_row is not implemented yet")

def do_table(header, linesep, body):
    """Convert one parsed table to HTML (not implemented yet).

    A commented-out draft implementation follows this function in the file.

    Args:
        header: Presumably the text of the header rows, as passed by
            handle_table() - TODO confirm once implemented.
        linesep: Presumably the separator line between header and body
            (the input parse_colinfo() expects) - TODO confirm.
        body: Presumably the text of the body rows - TODO confirm.

    Returns:
        Nothing yet. handle_table() concatenates the result with strings,
        so the finished implementation has to return a str.

    Raises:
        NotImplementedError: Always, until the table converter is written.
    """
    # Raise explicitly rather than assert: under "python -O" the assert is
    # stripped, this stub returns None, and the caller then fails with an
    # unrelated TypeError while concatenating strings.
    raise NotImplementedError("do_table is not implemented yet")

	# passthroughs = [
	# 	r"<colgroup(?:\s+.*)?>",
	# 	r"</colgroup>",
	# 	r"<col(?:\s+.*)?>",
	# 	r"</col>",
	# 	r"<tfoot>",
	# 	r"</tfoot>",
	# ];

	# output_table = "<table>";

	# inside_thead = 0;
	# inside_tr = 0;

	# print("processing header rows");

	# for row in header.split("\n"):
	# 	print(f'row = "{row}"');

	# 	while row:
	# 		if (m := re.match(r'\s+', row)):
	# 			row = row[len(m.group(0)):];

	# 			print(f'row = "{row}"');

	# 			continue;

	# 		for passthrough in passthroughs:
	# 			if row.startswith(passthrough):
	# 				assert(not "TODO");

	# 				continue;

	# 		# explicit caption with optional attributes
	# 		if (m := re.match(r"<caption(?:\s+([^<>\n]*))?>([^\n]+)</caption>", row)):
	# 			print(f'found explicit caption tag: "{m.group(0)}"');

	# 			output_table += m.group(0);

	# 			row = row[len(m.group(0)):];

	# 			print(f'row = "{row}"');

	# 			wrote_thead = 1;

	# 			continue;

	# 		# send thead through, but remember we shouldn't do one outselves
	# 		if (m := re.match(r'<thead(?:\s+[^<>\n]*)?>', row)):
	# 			print(f'found explicit thead tag: "{m.group(0)}"');

	# 			output_table += m.group(0);

	# 			inside_thead = 1;

	# 			row = row[len(m.group(0)):];

	# 			print(f'row = "{row}"');

	# 			continue;
	#
	# 		# does this row have an explicit HTML tag?
	# 		if (m := re.match(r"<tr(?:\s+[^<>\n]*)?>", row)):
	# 			print(f'found explicit tr tag: "{m.group(0)}"');

	# 			output_table += m.group(0);

	# 			inside_tr = 1;

	# 			row = row[len(m.group(0)):];

	# 			print(f'row = "{row}"');

	# 			continue;

	# 		print("thead content");
	#
	# 		if not inside_thead:
	# 			output_table += "<thead>";

	# 			wrote_thead = 1;
	#
	# 		if not inside_tr:
	# 			output_table += "<tr>";

	# 		# parse row line
	# 		html_row = do_row(row, [], default_tag = "th");
	#
	# 		print(f'html_row = "{html_row}"');

	# 		output_table += html_row;

	# 		output_table += "</tr>";

	# 		inside_tr = 0;

	# 		break;

	# print(f'output_table = "{output_table}"');

	# column_info = parse_colinfo(linesep);

	# first_tbody = 1;

	# inside_tbody = 0;
	# inside_tr = 0;

	# for row in body.strip().split('\n'):
	# 	print(f'row = "{row}"');

	# 	if not row.strip():
	# 		print("blank line; new tbody");

	# 		inside_tbody = 0;

	# 		continue;

	# 	while row:
	# 		if (m := re.match(r'\s+', row)):
	# 			row = row[len(m.group(0)):];

	# 			print(f'row = "{row}"');

	# 			continue;

	# 		for passthrough in passthroughs:
	# 			if row.startswith(passthrough):
	# 				assert(not "TODO");

	# 				continue;

	# 		# explicit caption with optional attributes
	# 		if (m := re.match(r"<caption(?:\s+([^<>\n]*))?>([^\n]+)</caption>", row)):
	# 			print(f'found explicit caption tag: "{m.group(0)}"');

    #             # a caption eats content until a table part or a delimiter

    #             # we don't eat either the table part or the delimiter; let
    #             # whatever future logic process that

    #             assert(not "TODO");
	# 			# output_table += m.group(0);

	# 			# row = row[len(m.group(0)):];

	# 			# print(f'row = "{row}"');

	# 			# wrote_thead = 1;

	# 			# continue;

	# 		# explicit HTML tbody. pass it through and take note
	# 		if (m := re.match(r'<tbody(?:\s+.*)?>', row)):
	# 			print(f'found explicit tbody tag: "{m.group(0)}"');

	# 			output_table += m.group(0);

	# 			inside_tbody = 1;

	# 			row = row[len(m.group(0)):];

	# 			print(f'row = "{row}"');

	# 			wrote_thead = 1;

	# 			continue;

	# 		# does this row have an explicit HTML tag?
	# 		if (m := re.match("<tr(?:\s+.*)?>", row)):
	# 			print(f'found explicit tr tag: "{m.group(0)}"');

	# 			output_table += m.group(0);

	# 			inside_tr = 1;

	# 			row = row[len(m.group(0)):];

	# 			print(f'row = "{row}"');

	# 			continue;

	# 		print("tbody content");

	# 		if not inside_tbody:
	# 			output_table += "<tbody>";
	#
	# 		if not inside_tr:
	# 			output_table += "<tr>";

	# 		html_row = do_row(row, column_info, default_tag = "td");
	#
	# 		print(f'html_row = "{html_row}"');

	# 		output_table += html_row;

	# 		output_table += "</tr>";

	# 		inside_tr = 0;

	# 		break;

	# # if (caption := mm.group(4)):
	# # 	output_table += f'<caption> {caption} </caption>'

	# # 	assert(not "CHECK");

	# output_table += "</table>";

	# return "\n\n" + output_table + "\n\n";

def handle_table(m):
	"""re.sub() callback: turn one matched table into HTML.

	Args:
		m: Match object produced by the module-level ``table`` pattern.

	Returns:
		"<table>" + do_table(...) + "</table>" on success. If do_table()
		raises SyntaxError, the error is reported on stdout and the matched
		text is returned unchanged so the remaining tables still get
		processed.
	"""
	# Group 1 of the table pattern is the optional literal "<table>" tag, so
	# the header rows, separator line and body rows are groups 2, 3 and 4.
	header = m.group(2)
	linesep = m.group(3)
	body = m.group(4)

	try:
		return "<table>" + do_table(header, linesep, body) + "</table>"
	except SyntaxError as e:
		print(f"caught syntax error: {e}")
		print("moving on to next table...")
		return m.group(0)

# def handle_table2(m):
# 	table_attributes = m.group(1) or "";
# 	header = m.group(2);
# 	linesep = m.group(3);
# 	body = m.group(4);
#
# 	try:
# 		return f"<table {table_attributes}>" + do_table(header, linesep, body) + "</table>";
# 	except SyntaxError as e:
# 		print(f"caught syntax error: {e}");
# 		print("moving on to next table...");
# 		return m.group(0);

# Read the Markdown source to convert.
with open("test.txt") as stream:
	text = stream.read()

# All fragments below are combined into one pattern compiled with re.VERBOSE,
# so the spaces inside them are insignificant.

# Delimiters between cells.
delimiter = r"(?: [|] | <tr> | <th>)"

# A row is anything with at least one delimiter.
row = fr"(?: .* {delimiter} .*)"

# Most table parts are simple.
table_part = "(?: <colgroup> | <col> | <thead> | <tbody> | <tfoot> | </caption>)"

# Captions eat until the next table part or delimiter.
temp = fr"(?: {table_part} | <caption>)"
# caption_table_part = fr"(?: <caption> (?! (?: {delimiter} | {temp})) .*)"
# caption_table_part = fr"(?: <caption> (?!{delimiter}) .*)"
caption_table_part = fr"(?: <caption> [^<>]* </caption>)"
table_part = fr"(?: {table_part} | {caption_table_part})"

# A table line can also be all table parts.
row = fr"(?:{row} | {table_part}+)"

# Between the header rows and the body rows there is a line separator.
seperator_line = r"\s* [|]? \s* [-=:]+ \s* (?: \s* [|] \s* [-=:]* \s* )* \s*"

# Regex for the whole table.
#
# NOTE: this is an f-string, so every literal regex brace has to be doubled.
# A bare {2} is a replacement field that renders as the text "2", which would
# make the pattern require a newline followed by the digit 2.
#
# NOTE: the "+" of the header and body groups sits inside the capture. With
# the "+" outside, a repeated group keeps only its last repetition, i.e. just
# the final row.
table = fr"""
    # two blank lines:
    [\n]{{2}}

    # optional open table tag:
    (?: (<table>)? \n)

    # one or more header rows:
    ((?: {row} \n{{1,2}})+)

    # line separator:
    ({seperator_line}) \n

    # one or more body rows:
    ((?: {row} \n{{1,2}})+)

    # optional caption:
    (?: \[ ([a-z0-9 "']+) \] \n)?

    # optional close table tag:
    (?: (</table>)? \n)

    # two blank lines:
    [\n]{{2}}
"""


print(table)

text = re.sub(table, handle_table, text, flags=re.VERBOSE)

#
#
#
# text = re.sub(r"""
# 	# blank line before:
# 	[\n]{1}
#
# 	# header:
# 	( (?: [^\n]+ \n)+ )
#
# 	# seperator line:
# 	(\s* [|]? \s* [-=:]+ \s* (?: \s* [|] \s* [-=:]* \s* )* \s*) \n
#
# 	# body, which is nothing but table parts, or contains at least one
#
#     # delimiter ("|", "<td>", "<th>"), or a line
#
#     # "<caption>" is special table tag, that may contain anything up to:
#         # the end of line
#         # other tableparts
#         # explicit start of cell
#
# 	((?:
#         # It contains a delimiter OR it's entirely table parts
#         ( .* ("|" | "<tr>" | "<th>" ) .*)
#         | ( "<colgroup>"
#           | "<col>"
#           | "<thead>"
#           | "<tbody>"
#           | "<tfoot>"
#           | "<caption>" (?!^ (("|" | "<tr>" | "<th>") | table part))+ )+
#
#         # optionally extra newline after
#         \n?
#     )+)
#
# 	# optional caption: (conflicts with tom cells, or even just normal rows)
# 	# (?: (\[ [^][\n]+ \]) \n )?
#
# 	# blank line after:
# 	[\n]{1}
# """, handle_table, text, flags=re.VERBOSE)
#
# print(f'text = """{text}"""');
#
# text += """
# <style>
# table
# {
#   border-collapse: collapse;
#   margin: 1em 0;
# }
#
# th, td
# {
#   border: thin solid grey;
#   padding: 0.5em;
# }
# </style>
# """;
#
# with open("test.html", "w") as stream:
# 	stream.write(text);
#