lambda-calc-1/parse/parse.c
2025-01-20 13:40:51 -06:00

648 lines
16 KiB
C

#include <stdlib.h>
#include <assert.h>
#include <stdbool.h>
#include <debug.h>
#include <string/struct.h>
#include <string/free.h>
#include <number/new.h>
#include <number/set_str.h>
#include <number/free.h>
#include <memory/srealloc.h>
#include <value/number/new.h>
#include <value/free.h>
#include <string/inc.h>
#include <expression/literal/new.h>
#include <expression/variable/new.h>
#include <expression/lambda/new.h>
#include <expression/application/new.h>
#include <expression/parenthesis/new.h>
#include <expression/error/new.h>
#include <expression/inc.h>
#include <expression/free.h>
#include <statement/expression/new.h>
#include <statement/substatements/new.h>
#include <statement/assignment/new.h>
#include <statement/error/new.h>
#include <statement/inc.h>
#include <statement/free.h>
#include "token/struct.h"
#include "token/inc.h"
#include "token/free.h"
#include "tokenizer/struct.h"
#include "tokenizer/put_back.h"
#include "tokenizer/next.h"
#include "parse.h"
// we need to gracefully handle errors
// we also need to allow for newlines ending statements only when they \
could.
// Forward declaration: the grammar is mutually recursive —
// parse_lambda_expression and parse_primary_expression below both call
// back into parse_application_expression, which is defined further down.
struct expression* parse_application_expression(
	struct tokenizer* tokenizer,
	bool in_parentheses);
// Parses a lambda expression:  lambda name[, name]... (':' | '.') body
// Precondition: the current token is tk_lambda.
// Returns an owned expression.  With multiple names the result is curried:
// the loop below wraps the body with the LAST name first, so
// "lambda a, b: e" parses as a lambda over `a` whose body is a lambda
// over `b` whose body is `e`.
// NOTE(review): malformed input (no identifier after lambda, or a name
// list not closed by ':' or '.') is still TODO and currently exit(1)s —
// see the error-handling note at the top of the file.
struct expression* parse_lambda_expression(
	struct tokenizer* tokenizer,
	bool in_parentheses)
{
	ENTER;

	assert(tokenizer->token->kind == tk_lambda);

	// consume the lambda token; newlines may appear before the first name
	tokenizer_next(tokenizer);

	while (tokenizer->token->kind == tk_newline)
	{
		tokenizer_next(tokenizer);
	}

	// at least one bound name is required
	if (tokenizer->token->kind != tk_identifier)
	{
		TODO;
		exit(1);
	}

	// growable array holding the bound names, in source order
	struct {
		struct string** data;
		size_t n, cap;
	} names = {};

	while (tokenizer->token->kind == tk_identifier)
	{
		if (names.n == names.cap)
		{
			// grow geometrically; GNU `x ?: y` yields 1 when cap is 0
			names.cap = names.cap << 1 ?: 1;
			names.data = srealloc(names.data,
				sizeof(*names.data) * names.cap);
		}

		// take our own reference on the name; released after the
		// wrapping loop below
		names.data[names.n++] = inc_string(tokenizer->token->text);

		tokenizer_next(tokenizer);

		// newlines may separate names
		while (tokenizer->token->kind == tk_newline)
		{
			tokenizer_next(tokenizer);
		}

		// the comma between names is optional ("lambda a, b" or "lambda a b")
		if (tokenizer->token->kind == tk_comma)
		{
			tokenizer_next(tokenizer);
		}
	}

	assert(names.n);

	while (tokenizer->token->kind == tk_newline)
	{
		tokenizer_next(tokenizer);
	}

	// the name list must be closed by ':' or '.'
	if (true
		&& tokenizer->token->kind != tk_colon
		&& tokenizer->token->kind != tk_dot)
	{
		dpvu(tokenizer->token->kind);
		TODO;
		exit(1);
	}

	tokenizer_next(tokenizer);

	// the body extends as far as a (greedy) application can reach
	struct expression* body = parse_application_expression(
		tokenizer, in_parentheses);

	// wrap the body in one lambda per name, innermost (last name) first
	struct expression* retval = inc_expression(body);

	for (size_t i = names.n; i > 0; i--)
	{
		struct expression* prev = retval;

		retval = new_lambda_expression(
			/* variable name: */ names.data[i - 1],
			/* body: */ prev);

		// release the intermediate reference now owned by the new lambda
		free_expression(prev);
	}

	// release the references taken by inc_string above
	for (size_t i = 0; i < names.n; i++)
	{
		free_string(names.data[i]);
	}

	// release our reference to the body (retval keeps it alive)
	free_expression(body);

	free(names.data);

	EXIT;
	return retval;
}
// def parse-primary():
// literal
// variable
// parenthesis:
// parse-application(in-parenthesis = true)
// lambda:
// eat zero or more newlines
// if error token:
// return error expression ("unknown token when reading \
lambda expression")
// while not colon:
// if error token:
// return error expression ("unknown token when reading \
lambda expression")
// expect identifier:
// return error expression ("unexpected token when reading \
lambda expression")
// add to variable list
// eat identifier
// eat zero or more newlines
// if error token:
// return error expression ("unknown token when \
reading lambda expression")
// expect comma
// return error expression ("unexpected token when reading \
lambda expression, expected comma or colon")
// eat comma
// eat colon
// eat zero or more newlines
// if error token:
// return error expression ("unknown token when reading \
lambda expression")
// parse-application
// tk_error:
// create error expression ("unknown token")
// EOF:
// create error expression ("unexpected EOF, expected expression")
// default:
// create error expression ("unexpected token")
// Parses one primary expression: a numeric literal, a variable reference,
// a parenthesized application, or a lambda.  On entry the current token is
// the first token of the primary; on exit the tokenizer is past it.
// Returns an owned expression, or NULL on the paths that are still TODO.
// Fix: retval is initialized to NULL — the TODO branches (tk_newline,
// tk_error, default, and the unfinished cases inside the parenthesis
// handling) previously fell through and returned an uninitialized
// pointer, which is undefined behavior.
struct expression* parse_primary_expression(
	struct tokenizer* tokenizer,
	bool in_parentheses)
{
	struct expression* retval = NULL;

	ENTER;

	switch (tokenizer->token->kind)
	{
		case tk_newline:
		{
			TODO;
			break;
		}

		case tk_literal:
		{
			// convert the token text into a number value
			struct number* literal = new_number();

			number_set_str(
				/* instance: */ literal,
				/* literal string: */ tokenizer->token->text->data);

			struct value* value = new_number_value(literal);

			retval = new_literal_expression(value);

			tokenizer_next(tokenizer);

			// drop our local references now that the expression is built
			// (presumably the expression retains what it needs — verify)
			free_value(value);
			free_number(literal);
			break;
		}

		case tk_identifier:
		{
			retval = new_variable_expression(
				/* variable name: */ tokenizer->token->text);
			tokenizer_next(tokenizer);
			break;
		}

		case tk_oparen:
		{
			// '(' application ')'
			tokenizer_next(tokenizer);

			struct expression* subexpression =
				parse_application_expression(tokenizer,
					/* in parenthesis: */ true);

			switch (tokenizer->token->kind)
			{
				case tk_cparen:
					retval = new_parenthesis_expression(subexpression);
					break;

				case tk_EOF:
				{
					retval = new_error_expression(
						L"Unexpected EOF when reading parenthesis. "
						L"Expecting close parenthesis.",
						subexpression);
					break;
				}

				case tk_semicolon:
					TODO;
					break;

				case tk_error:
					TODO;
					break;

				default:
					TODO;
					break;
			}

			// NOTE(review): this also advances when the token was EOF —
			// assumes tokenizer_next is safe to call at EOF; confirm.
			tokenizer_next(tokenizer);

			free_expression(subexpression);
			break;
		}

		case tk_lambda:
		{
			retval = parse_lambda_expression(tokenizer, in_parentheses);
			break;
		}

		case tk_error:
			TODO;
			break;

		default:
			TODO;
			break;
	}

	EXIT;
	return retval;
}
// # parse-application is greedy, and since it's called for parsing the \
# body of lambdas, and lambdas inside that lambda, etc. the shift/reduce \
# error will work out in our favor
// def parse-application(in-parenthesis = false):
// parse parse-primary
// # newlines can't stop us if we're in parenthesis:
// if in-parenthesis:
// while current token is newline:
// eat the newline
// match (current token):
// case (number, variable, oparen, lambda):
// parse that primary, connect the two with an 'application' \
expression
// continue as you can
// case (close paren, EOF, newline, semicolon):
// if in-parenthesis = false:
// create error expression
// otherwise:
// return what you have
// tk_error:
// create error expression ("unknown token")
// connect the two with an 'application' expression
// return what you have
// Parses a (left-associative) application: one primary expression followed
// by zero or more further primaries, each folded in with
// new_application_expression.  Greedy: keeps extending until a terminator
// (newline outside parentheses, ';', ')', or EOF) — see the design notes
// in the comment above this function.
// Fixes:
//  - retval is initialized to NULL so the TODO branches in the first
//    switch no longer cause an uninitialized-pointer read (UB).
//  - inside parentheses a newline no longer hits TODO: per the design
//    notes ("newlines can't stop us if we're in parenthesis"), the run of
//    newlines is skipped and the application continues.
struct expression* parse_application_expression(
	struct tokenizer* tokenizer,
	bool in_parentheses)
{
	struct expression* retval = NULL;

	ENTER;

	// leading newlines never start an application
	while (tokenizer->token->kind == tk_newline)
	{
		tokenizer_next(tokenizer);
	}

	switch (tokenizer->token->kind)
	{
		case tk_identifier:
		case tk_lambda:
		case tk_literal:
		case tk_oparen:
		{
			retval = parse_primary_expression(
				/* tokenizer: */ tokenizer,
				/* in parenthesis? */ in_parentheses);
			break;
		}

		case tk_error:
			TODO;
			break;

		case tk_EOF:
		{
			retval = new_error_expression(L"Unexpected EOF.", NULL);
			break;
		}

		default:
			TODO;
			break;
	}

	// is the next thing a part of the application?
	again: switch (tokenizer->token->kind)
	{
		case tk_identifier:
		case tk_lambda:
		case tk_literal:
		case tk_oparen:
		{
			// fold the next primary in on the right (left-associative)
			struct expression* left = retval;

			struct expression* right = parse_primary_expression(
				tokenizer, in_parentheses);

			retval = new_application_expression(left, right);

			free_expression(left);
			free_expression(right);

			goto again;
		}

		case tk_newline:
		{
			if (in_parentheses)
			{
				// a newline inside parentheses cannot end the
				// application: skip the newline run and keep extending
				while (tokenizer->token->kind == tk_newline)
				{
					tokenizer_next(tokenizer);
				}
				goto again;
			}
			break;
		}

		case tk_semicolon:
		case tk_EOF:
		case tk_cparen:
		{
			break;
		}

		case tk_error:
		{
			TODO;
			break;
		}

		default:
		{
			TODO;
			break;
		}
	}

	EXIT;
	return retval;
}
struct statement* parse_statement(
struct tokenizer* tokenizer)
{
struct statement* retval;
ENTER;
switch (tokenizer->token->kind)
{
case tk_identifier:
{
struct token* token = inc_token(tokenizer->token);
tokenizer_next(tokenizer);
switch (tokenizer->token->kind)
{
case tk_arrow:
{
tokenizer_next(tokenizer);
struct expression* expression =
parse_application_expression(
/* tokenizer: */ tokenizer,
/* in parenthesis: */ false);
retval = new_assignment_statement(
/* variable name: */ token->text,
/* expression: */ expression);
free_expression(expression);
break;
}
case tk_newline:
case tk_EOF:
case tk_semicolon:
{
struct expression* expression = new_variable_expression(
/* variable name: */ token->text);
retval = new_expression_statement(expression);
free_expression(expression);
break;
}
case tk_oparen:
case tk_identifier:
case tk_literal:
case tk_lambda:
{
tokenizer_put_back(tokenizer, token);
struct expression* exp = parse_application_expression(
/* tokenizer: */ tokenizer,
/* in parenthesis? */ false);
retval = new_expression_statement(
/* expression: */ exp);
free_expression(exp);
break;
}
case tk_error:
TODO;
// create error statement ("unknown token",
// stragglers = [first token])
// return what you have
break;
default:
TODO;
break;
}
free_token(token);
break;
}
case tk_lambda:
case tk_literal:
case tk_oparen:
{
// call parse-application
struct expression* exp = parse_application_expression(
/* tokenizer: */ tokenizer,
/* in parenthesis? */ false);
retval = new_expression_statement(
/* expression: */ exp);
free_expression(exp);
break;
}
case tk_semicolon:
TODO;
break;
case tk_cparen:
{
retval = new_error_statement(
L"Unexpected close parenthesis when reading statement", NULL);
break;
}
case tk_error:
TODO;
// create error statement ("unknown token")
// return what you have
break;
default:
TODO;
// create error statement ("unexpected token")
// return what you have
break;
}
EXIT;
return retval;
}
// Parses a run of statements up to the next newline or EOF, folding them
// pairwise into a single (possibly nested) substatements statement.
// After each statement, any stray close-parentheses are wrapped into
// error statements and semicolon separators are skipped.  A trailing
// newline is consumed.  Precondition: the current token starts a
// statement (not newline or EOF).  Returns an owned statement.
struct statement* parse_statements(
	struct tokenizer* tokenizer)
{
	ENTER;

	assert(tokenizer->token->kind != tk_newline);
	assert(tokenizer->token->kind != tk_EOF);

	struct statement* accum = NULL;

	for (;;)
	{
		// a newline or EOF ends the run
		if (tokenizer->token->kind == tk_newline
			|| tokenizer->token->kind == tk_EOF)
		{
			break;
		}

		struct statement* stmt = parse_statement(tokenizer);

		if (!accum)
		{
			// first statement: just take a reference
			accum = inc_statement(stmt);
		}
		else
		{
			// fold the new statement into the accumulator
			struct statement* combined =
				new_substatements_statement(accum, stmt);
			free_statement(accum);
			accum = combined;
		}

		// wrap each stray close-parenthesis as an error statement
		while (tokenizer->token->kind == tk_cparen)
		{
			struct statement* wrapped =
				new_error_statement(L"Extra close parenthesis.", accum);
			free_statement(accum);
			accum = wrapped;
			tokenizer_next(tokenizer);
		}

		// semicolons between statements are skipped
		while (tokenizer->token->kind == tk_semicolon)
		{
			tokenizer_next(tokenizer);
		}

		// drop our reference; accum keeps the statement alive
		free_statement(stmt);
	}

	// consume the newline that ended the run, if any
	if (tokenizer->token->kind == tk_newline)
	{
		tokenizer_next(tokenizer);
	}

	assert(accum);

	EXIT;
	return accum;
}
// Top-level entry point: parses one batch of statements from the
// tokenizer.  Ownership of the returned statement passes to the caller.
struct statement* parse(
	struct tokenizer* tokenizer)
{
	ENTER;

	struct statement* result = parse_statements(tokenizer);

	EXIT;
	return result;
}