lambda-calc-1/parse/tokenizer/next.c
2025-01-20 13:40:51 -06:00

432 lines
12 KiB
C

#include <sys/param.h>
#include <assert.h>
#include <debug.h>
#include <memory/srealloc.h>
#include <string/new.h>
#include <string/free.h>
#include <wcistream/struct.h>
#include <wcistream/read.h>
#include "../position/struct.h"
#include "../position/clone.h"
#include "../position/inc.h"
#include "../position/assign.h"
#include "../position/free.h"
#include "../token/new.h"
#include "../token/inc.h"
#include "../token/free.h"
#include "struct.h"
#include "next.h"
// Tokenizer state machine.
//
// lookup[state][c] is the state entered when character column `c` is seen
// while in `state`. Columns 0..127 are the ASCII code points (0 doubles as
// the end-of-input marker); column 128 stands for every code point above
// 127, which tokenizer_next() selects by clamping with MIN(wc, 128).
//
// The enumerator order is significant to tokenizer_next():
//   - states below s_start are terminal: the scan stops and the character
//     that caused the transition is left unread;
//   - s_start drops whatever was collected and begins a new token;
//   - states above s_start are intermediate: the character is kept and the
//     scan continues.
//
// Entries that are not listed are zero-initialised, i.e. s_error.
static const enum state {
	// terminal states:
	s_error,
	s_EOF,
	s_number,
	s_colon,
	s_comma,
	s_equals, // currently unreachable: '=' starts an identifier
	s_semicolon,
	s_oparen,
	s_cparen,
	s_newline,
	s_identifier,
	
	// (re)start of a token:
	s_start,
	
	// intermediate states:
	s_reading_newline,
	s_reading_slash,
	s_skipping_comment,
	s_skipping_comment_slash,
	s_reading_colon,
	s_reading_comma,
	s_reading_equals, // currently unreachable, see s_equals
	s_reading_semicolon,
	s_reading_oparen,
	s_reading_cparen,
	s_reading_number,
	s_reading_identifier,
	
	number_of_states,
} lookup[number_of_states][127 + 1 + 1] = {
	// GNU range designator covering every column. A row's [ANY] entry acts
	// as its default; more specific designators written after it override it.
	#define ANY 0 ... 128
	
	// EOF:
	[s_start][0] = s_EOF,
	
	// skip whitespace:
	[s_start][' '] = s_start,
	[s_start]['\t'] = s_start,
	
	// newlines are tokens of their own:
	[s_start]['\n'] = s_reading_newline,
		[s_reading_newline][ANY] = s_newline,
	
	// skip comments ('#' up to and including the end of the line; a
	// backslash makes the following character part of the comment):
	[s_start]['#'] = s_skipping_comment,
		[s_skipping_comment][ANY] = s_skipping_comment,
		[s_skipping_comment]['\\'] = s_skipping_comment_slash,
			[s_skipping_comment_slash][ANY] = s_skipping_comment,
		[s_skipping_comment]['\n'] = s_start,
		// End-of-input inside a comment must terminate the scan. Without
		// these two entries the [ANY] defaults above keep the machine in
		// the comment states forever once the stream is exhausted.
		[s_skipping_comment][0] = s_EOF,
		[s_skipping_comment_slash][0] = s_EOF,
	
	// skip escaped newlines (a backslash and the character after it):
	[s_start]['\\'] = s_reading_slash,
		[s_reading_slash][ANY] = s_start,
	
	// symbols:
	[s_start][':'] = s_reading_colon,
		[s_reading_colon][ANY] = s_colon,
	[s_start][','] = s_reading_comma,
		[s_reading_comma][ANY] = s_comma,
	[s_start][';'] = s_reading_semicolon,
		[s_reading_semicolon][ANY] = s_semicolon,
	
	// brackets:
	[s_start]['('] = s_reading_oparen,
		[s_reading_oparen][ANY] = s_oparen,
	[s_start][')'] = s_reading_cparen,
		[s_reading_cparen][ANY] = s_cparen,
	
	// numeric literals (digits and dots, starting with a digit):
	[s_start]['0' ... '9'] = s_reading_number,
		[s_reading_number][ ANY ] = s_number,
		[s_reading_number]['.'] = s_reading_number,
		[s_reading_number]['0' ... '9'] = s_reading_number,
	
	// identifiers (the set of characters that may start one is larger than
	// the set that may continue one):
	[s_start]['?'] = s_reading_identifier,
	[s_start]['!'] = s_reading_identifier,
	[s_start]['-'] = s_reading_identifier,
	[s_start]['>'] = s_reading_identifier,
	[s_start]['='] = s_reading_identifier,
	[s_start]['<'] = s_reading_identifier,
	[s_start]['+'] = s_reading_identifier,
	[s_start]['*'] = s_reading_identifier,
	[s_start]['_'] = s_reading_identifier,
	[s_start]['/'] = s_reading_identifier,
	[s_start]['a' ... 'z'] = s_reading_identifier,
	[s_start]['A' ... 'Z'] = s_reading_identifier,
	[s_start][128] = s_reading_identifier,
		[s_reading_identifier][ANY] = s_identifier,
		[s_reading_identifier]['!'] = s_reading_identifier,
		[s_reading_identifier]['+'] = s_reading_identifier,
		[s_reading_identifier]['*'] = s_reading_identifier,
		[s_reading_identifier]['_'] = s_reading_identifier,
		[s_reading_identifier]['/'] = s_reading_identifier,
		[s_reading_identifier]['-'] = s_reading_identifier,
		[s_reading_identifier]['a' ... 'z'] = s_reading_identifier,
		[s_reading_identifier]['A' ... 'Z'] = s_reading_identifier,
		[s_reading_identifier]['0' ... '9'] = s_reading_identifier,
		[s_reading_identifier][128] = s_reading_identifier,
};
// Advances the tokenizer by one token, leaving the result in this->token.
//
// If a token was put back (this->put_back), it becomes the current token
// again and nothing is read from the stream. Otherwise characters are
// taken from this->stream->wc and run through the `lookup` state machine
// until a terminal state (one below s_start) is reached. The collected
// characters (this->rawtoken) and the start/end positions are then wrapped
// in a new token whose kind depends on the terminal state.
//
// this->position is advanced in place for every character consumed,
// including skipped ones (whitespace, comments, escaped newlines), so the
// positions recorded in tokens match their location in the input.
//
// The character that ends a token is left in this->stream->wc, unread, for
// the next call.
void tokenizer_next(
	struct tokenizer* this)
{
	ENTER;
	
	if (this->put_back)
	{
		// Swap the put-back token in as the current one: drop the old
		// current token, take a reference to the put-back token through
		// inc_token(), then release the put-back slot.
		free_token(this->token);
		
		this->token = inc_token(this->put_back);
		
		free_token(this->put_back);
		
		this->put_back = NULL;
	}
	else
	{
		this->rawtoken.n = 0;
		
		// Appends `c` to this->rawtoken, doubling the buffer's capacity
		// (starting from 1) whenever it is full.
		void append(wchar_t c)
		{
			ENTER;
			
			if (this->rawtoken.n == this->rawtoken.cap)
			{
				this->rawtoken.cap = this->rawtoken.cap << 1 ?: 1;
				
				this->rawtoken.data = srealloc(
					this->rawtoken.data,
					sizeof(*this->rawtoken.data) * this->rawtoken.cap);
			}
			
			this->rawtoken.data[this->rawtoken.n++] = c;
			
			EXIT;
		}
		
		struct position* start_position = clone_position(this->position);
		
		// Not a copy: advancing end_position advances the tokenizer's own
		// position.
		struct position* end_position = this->position;
		
		// Moves end_position past the current character and reads the next
		// one from the stream. A tab counts as four columns; a newline
		// moves to column 1 of the next line.
		void consume(void)
		{
			ENTER;
			
			switch (this->stream->wc)
			{
				case '\t': end_position->column += 4; break;
				case '\n': end_position->line++, end_position->column = 1; break;
				default: end_position->column++; break;
			}
			
			wcistream_read(this->stream);
			
			EXIT;
		}
		
		enum state state = s_start;
		
		while (state >= s_start)
		{
			dpvu(this->stream->wc);
			dpvwc(this->stream->wc);
			
			// Everything above ASCII shares column 128 of the table.
			// NOTE(review): assumes this->stream->wc is never negative --
			// confirm against the type declared in wcistream/struct.h.
			state = lookup[state][MIN(this->stream->wc, 128)];
			
			if (state > s_start)
			{
				// Inside a token (or a comment): keep the character.
				append(this->stream->wc);
				
				consume();
			}
			else if (state == s_start)
			{
				// Skipped character. It still has to be counted, otherwise
				// every later token on the line is reported too far left
				// and line numbers drift after comments and escaped
				// newlines. The next token starts right after it.
				consume();
				
				this->rawtoken.n = 0;
				
				assign_position(start_position, end_position);
			}
			
			// Otherwise a terminal state was reached: the current character
			// is left unread and the loop condition ends the scan.
		}
		
		// NUL-terminate the raw text without counting the terminator.
		append(0), this->rawtoken.n--;
		
		free_token(this->token), this->token = NULL;
		
		struct position* end_clone = clone_position(end_position);
		
		// Builds this->token with the given tk_* kind from the collected
		// text and the start/end positions of this scan.
		void emit(int kind)
		{
			ENTER;
			
			struct string* text = new_string(
				/* data: */ this->rawtoken.data,
				/* len:  */ this->rawtoken.n);
			
			this->token = new_token(
				/* kind: */ kind,
				/* text: */ text,
				/* start position: */ start_position,
				/* end position: */ end_clone);
			
			free_string(text);
			
			EXIT;
		}
		
		switch (state)
		{
			case s_error:
			{
				dpvws(this->rawtoken.data);
				
				TODO;
				break;
			}
			
			case s_EOF:
			{
				// No text; starts and ends at the current position.
				this->token = new_token(
					/* kind: */ tk_EOF,
					/* text: */ NULL,
					/* start position: */ end_clone,
					/* end position: */ end_clone);
				break;
			}
			
			case s_identifier:
			{
				dpvws(this->rawtoken.data);
				
				// Two spellings scan as identifiers but are keywords.
				if (!wcscmp(this->rawtoken.data, L"λ"))
					emit(tk_lambda);
				else if (!wcscmp(this->rawtoken.data, L"<-"))
					emit(tk_arrow);
				else
					emit(tk_identifier);
				break;
			}
			
			case s_newline:   emit(tk_newline);   break;
			case s_colon:     emit(tk_colon);     break;
			case s_comma:     emit(tk_comma);     break;
			case s_semicolon: emit(tk_semicolon); break;
			case s_oparen:    emit(tk_oparen);    break;
			case s_cparen:    emit(tk_cparen);    break;
			case s_number:    emit(tk_literal);   break;
			
			default:
				TODO;
				break;
		}
		
		free_position(start_position);
		
		free_position(end_clone);
	}
	
	EXIT;
}