432 lines
12 KiB
C
432 lines
12 KiB
C
|
|
#include <sys/param.h>
|
|
|
|
#include <assert.h>
|
|
|
|
#include <debug.h>
|
|
|
|
#include <memory/srealloc.h>
|
|
|
|
#include <string/new.h>
|
|
#include <string/free.h>
|
|
|
|
#include "../wcistream/struct.h"
|
|
#include "../wcistream/read.h"
|
|
|
|
#include "../position/struct.h"
|
|
#include "../position/clone.h"
|
|
#include "../position/inc.h"
|
|
#include "../position/assign.h"
|
|
#include "../position/free.h"
|
|
|
|
#include "../token/new.h"
|
|
#include "../token/inc.h"
|
|
#include "../token/free.h"
|
|
|
|
#include "struct.h"
|
|
#include "next.h"
|
|
|
|
/*
 * Lexer state machine: the states and the transition table that drives
 * tokenizer_next().
 *
 * The enumerators are ordered on purpose and must not be reordered:
 *   - every value below s_start is a terminal state: the scan loop stops
 *     and a token is built from it (s_error is 0, so any table cell that is
 *     never assigned means "error");
 *   - s_start means "nothing collected yet"; reaching it again discards
 *     whatever was collected (whitespace, comments, escaped characters);
 *   - every value above s_start is an in-progress state whose input
 *     character is appended to the raw token text.
 * tokenizer_next() compares states against s_start to tell these apart.
 *
 * lookup[state][c] is the next state for input c.  Columns 0..127 are the
 * ASCII code points and column 128 stands for every code point above ASCII
 * (the scanner clamps its input with MIN(wc, 128)).  Column 0 doubles as the
 * end-of-input marker.
 *
 * The table uses GNU range designators ("a ... b").  When a range and a
 * later designator name the same cell the later one wins, so each catch-all
 * ANY row has to come before its exceptions.
 */
static const enum state {
	// terminal states:
	s_error,
	
	s_EOF,
	
	s_number,
	
	s_colon,
	s_comma,
	s_equals,      // no transition in this table reaches it ('=' starts an identifier)
	s_semicolon,
	s_oparen,
	s_cparen,
	
	s_newline,
	
	s_identifier,
	
	// initial / "discard and restart" state:
	s_start,
	
	// in-progress states:
	s_reading_newline,
	
	s_reading_slash,
	
	s_skipping_comment,
	s_skipping_comment_slash,
	
	s_reading_colon,
	s_reading_comma,
	s_reading_equals,  // no transition in this table reaches it
	s_reading_semicolon,
	s_reading_oparen,
	s_reading_cparen,
	
	s_reading_number,
	
	s_reading_identifier,
	
	number_of_states,
} lookup[number_of_states][127 + 1 + 1] = {
	
	// every column: ASCII 0..127 plus the "above ASCII" column 128
	#define ANY 0 ... 128
	
	// EOF:
	[s_start][0] = s_EOF,
	
	// skip whitespace
	[s_start][' '] = s_start,
	[s_start]['\t'] = s_start,
	[s_start]['\n'] = s_reading_newline,
		[s_reading_newline][ANY] = s_newline,
	
	// skip comments ('#' up to the next unescaped newline):
	[s_start]['#'] = s_skipping_comment,
		[s_skipping_comment][ANY] = s_skipping_comment,
		[s_skipping_comment]['\\'] = s_skipping_comment_slash,
			[s_skipping_comment_slash][ANY] = s_skipping_comment,
			// end of input right after the backslash: stop instead of
			// going back into the comment
			[s_skipping_comment_slash][0] = s_EOF,
		[s_skipping_comment]['\n'] = s_start,
		// A comment that runs into end of input (no trailing newline)
		// must terminate.  Without this override the ANY row above keeps
		// the machine in s_skipping_comment on input 0, and the scan loop
		// would never leave as long as the stream keeps reporting 0.
		[s_skipping_comment][0] = s_EOF,
	
	// skip escaped newlines (the character after a backslash is dropped):
	[s_start]['\\'] = s_reading_slash,
		[s_reading_slash][ANY] = s_start,
	
	// symbols:
	[s_start][':'] = s_reading_colon,
		[s_reading_colon][ANY] = s_colon,
	[s_start][','] = s_reading_comma,
		[s_reading_comma][ANY] = s_comma,
	[s_start][';'] = s_reading_semicolon,
		[s_reading_semicolon][ANY] = s_semicolon,
	
	// brackets:
	[s_start]['('] = s_reading_oparen,
		[s_reading_oparen][ANY] = s_oparen,
	[s_start][')'] = s_reading_cparen,
		[s_reading_cparen][ANY] = s_cparen,
	
	// numeric literals (digits and any number of '.'):
	[s_start]['0' ... '9'] = s_reading_number,
		[s_reading_number][ ANY ] = s_number,
		[s_reading_number]['.'] = s_reading_number,
		[s_reading_number]['0' ... '9'] = s_reading_number,
	
	// identifiers (operators are lexed as identifiers too):
	[s_start]['?'] = s_reading_identifier,
	[s_start]['!'] = s_reading_identifier,
	[s_start]['-'] = s_reading_identifier,
	[s_start]['>'] = s_reading_identifier,
	[s_start]['='] = s_reading_identifier,
	[s_start]['<'] = s_reading_identifier,
	[s_start]['+'] = s_reading_identifier,
	[s_start]['*'] = s_reading_identifier,
	[s_start]['_'] = s_reading_identifier,
	[s_start]['/'] = s_reading_identifier,
	[s_start]['a' ... 'z'] = s_reading_identifier,
	[s_start]['A' ... 'Z'] = s_reading_identifier,
	[s_start][128] = s_reading_identifier,
		[s_reading_identifier][ANY] = s_identifier,
		[s_reading_identifier]['!'] = s_reading_identifier,
		[s_reading_identifier]['+'] = s_reading_identifier,
		[s_reading_identifier]['*'] = s_reading_identifier,
		[s_reading_identifier]['_'] = s_reading_identifier,
		[s_reading_identifier]['/'] = s_reading_identifier,
		[s_reading_identifier]['-'] = s_reading_identifier,
		[s_reading_identifier]['a' ... 'z'] = s_reading_identifier,
		[s_reading_identifier]['A' ... 'Z'] = s_reading_identifier,
		[s_reading_identifier]['0' ... '9'] = s_reading_identifier,
		[s_reading_identifier][128] = s_reading_identifier,
};
|
|
|
|
void tokenizer_next(
|
|
struct tokenizer* this)
|
|
{
|
|
ENTER;
|
|
|
|
if (this->put_back)
|
|
{
|
|
free_token(this->token);
|
|
|
|
this->token = inc_token(this->put_back);
|
|
|
|
free_token(this->put_back);
|
|
|
|
this->put_back = NULL;
|
|
}
|
|
else
|
|
{
|
|
this->rawtoken.n = 0;
|
|
|
|
void append(wchar_t c)
|
|
{
|
|
ENTER;
|
|
|
|
if (this->rawtoken.n == this->rawtoken.cap)
|
|
{
|
|
this->rawtoken.cap = this->rawtoken.cap << 1 ?: 1;
|
|
|
|
this->rawtoken.data = srealloc(
|
|
this->rawtoken.data,
|
|
sizeof(*this->rawtoken.data) * this->rawtoken.cap);
|
|
}
|
|
|
|
this->rawtoken.data[this->rawtoken.n++] = c;
|
|
|
|
EXIT;
|
|
}
|
|
|
|
struct position* start_position = clone_position(this->position);
|
|
struct position* end_position = this->position;
|
|
|
|
enum state state = s_start;
|
|
|
|
while (state >= s_start)
|
|
{
|
|
dpvu(this->stream->wc);
|
|
dpvwc(this->stream->wc);
|
|
|
|
state = lookup[state][MIN(this->stream->wc, 128)];
|
|
|
|
if (state > s_start)
|
|
{
|
|
append(this->stream->wc);
|
|
|
|
switch (this->stream->wc)
|
|
{
|
|
case '\t': end_position->column += 4; break;
|
|
|
|
case '\n': end_position->line++, end_position->column = 1; break;
|
|
|
|
default: end_position->column++; break;
|
|
}
|
|
|
|
wcistream_read(this->stream);
|
|
}
|
|
|
|
if (state == s_start)
|
|
{
|
|
this->rawtoken.n = 0;
|
|
|
|
assign_position(start_position, end_position);
|
|
|
|
wcistream_read(this->stream);
|
|
}
|
|
}
|
|
|
|
append(0), this->rawtoken.n--;
|
|
|
|
free_token(this->token), this->token = NULL;
|
|
|
|
struct position* end_clone = clone_position(end_position);
|
|
|
|
switch (state)
|
|
{
|
|
case s_error:
|
|
{
|
|
dpvws(this->rawtoken.data);
|
|
|
|
TODO;
|
|
|
|
break;
|
|
}
|
|
|
|
case s_EOF:
|
|
{
|
|
this->token = new_token(
|
|
/* kind: */ tk_EOF,
|
|
/* text: */ NULL,
|
|
/* start position: */ end_clone,
|
|
/* end position: */ end_clone);
|
|
|
|
break;
|
|
}
|
|
|
|
case s_newline:
|
|
{
|
|
struct string* text = new_string(
|
|
/* data: */ this->rawtoken.data,
|
|
/* len: */ this->rawtoken.n);
|
|
|
|
this->token = new_token(
|
|
/* kind: */ tk_newline,
|
|
/* text: */ text,
|
|
/* start position: */ start_position,
|
|
/* end position: */ end_clone);
|
|
|
|
free_string(text);
|
|
|
|
break;
|
|
}
|
|
|
|
case s_identifier:
|
|
{
|
|
dpvws(this->rawtoken.data);
|
|
|
|
struct string* text = new_string(
|
|
/* data: */ this->rawtoken.data,
|
|
/* len: */ this->rawtoken.n);
|
|
|
|
if (!wcscmp(this->rawtoken.data, L"λ"))
|
|
{
|
|
this->token = new_token(
|
|
/* kind: */ tk_lambda,
|
|
/* text: */ text,
|
|
/* start position: */ start_position,
|
|
/* end position: */ end_clone);
|
|
}
|
|
else if (!wcscmp(this->rawtoken.data, L"<-"))
|
|
{
|
|
this->token = new_token(
|
|
/* kind: */ tk_arrow,
|
|
/* text: */ text,
|
|
/* start position: */ start_position,
|
|
/* end position: */ end_clone);
|
|
}
|
|
else
|
|
{
|
|
this->token = new_token(
|
|
/* kind: */ tk_identifier,
|
|
/* text: */ text,
|
|
/* start position: */ start_position,
|
|
/* end position: */ end_clone);
|
|
}
|
|
|
|
free_string(text);
|
|
|
|
break;
|
|
}
|
|
|
|
case s_colon:
|
|
{
|
|
struct string* text = new_string(
|
|
/* data: */ this->rawtoken.data,
|
|
/* len: */ this->rawtoken.n);
|
|
|
|
this->token = new_token(
|
|
/* kind: */ tk_colon,
|
|
/* text: */ text,
|
|
/* start position: */ start_position,
|
|
/* end position: */ end_clone);
|
|
|
|
free_string(text);
|
|
|
|
break;
|
|
}
|
|
|
|
case s_comma:
|
|
{
|
|
struct string* text = new_string(
|
|
/* data: */ this->rawtoken.data,
|
|
/* len: */ this->rawtoken.n);
|
|
|
|
this->token = new_token(
|
|
/* kind: */ tk_comma,
|
|
/* text: */ text,
|
|
/* start position: */ start_position,
|
|
/* end position: */ end_clone);
|
|
|
|
free_string(text);
|
|
|
|
break;
|
|
}
|
|
|
|
case s_semicolon:
|
|
{
|
|
struct string* text = new_string(
|
|
/* data: */ this->rawtoken.data,
|
|
/* len: */ this->rawtoken.n);
|
|
|
|
this->token = new_token(
|
|
/* kind: */ tk_semicolon,
|
|
/* text: */ text,
|
|
/* start position: */ start_position,
|
|
/* end position: */ end_clone);
|
|
|
|
free_string(text);
|
|
|
|
break;
|
|
}
|
|
|
|
case s_oparen:
|
|
{
|
|
struct string* text = new_string(
|
|
/* data: */ this->rawtoken.data,
|
|
/* len: */ this->rawtoken.n);
|
|
|
|
this->token = new_token(
|
|
/* kind: */ tk_oparen,
|
|
/* text: */ text,
|
|
/* start position: */ start_position,
|
|
/* end position: */ end_clone);
|
|
|
|
free_string(text);
|
|
|
|
break;
|
|
}
|
|
|
|
case s_cparen:
|
|
{
|
|
struct string* text = new_string(
|
|
/* data: */ this->rawtoken.data,
|
|
/* len: */ this->rawtoken.n);
|
|
|
|
this->token = new_token(
|
|
/* kind: */ tk_cparen,
|
|
/* text: */ text,
|
|
/* start position: */ start_position,
|
|
/* end position: */ end_clone);
|
|
|
|
free_string(text);
|
|
|
|
break;
|
|
}
|
|
|
|
case s_number:
|
|
{
|
|
struct string* text = new_string(
|
|
/* data: */ this->rawtoken.data,
|
|
/* len: */ this->rawtoken.n);
|
|
|
|
this->token = new_token(
|
|
/* kind: */ tk_literal,
|
|
/* text: */ text,
|
|
/* start position: */ start_position,
|
|
/* end position: */ end_clone);
|
|
|
|
free_string(text);
|
|
|
|
break;
|
|
}
|
|
|
|
default:
|
|
TODO;
|
|
break;
|
|
}
|
|
|
|
free_position(start_position);
|
|
|
|
free_position(end_clone);
|
|
}
|
|
|
|
EXIT;
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|