lisp-take-1/parse/tokenizer/next.c

356 lines
9.8 KiB
C

#include <assert.h>
#include <debug.h>
#include <memory/srealloc.h>
#include <parse/istream/struct.h>
#include <parse/istream/read.h>
#include "struct.h"
#include "next.h"
/*
 * Tokenizer state machine: the set of states plus the transition table.
 *
 * lookup[state][byte] is the state entered when `byte` is seen in `state`.
 * States declared before s_start are terminal: tokenizer_next_token() stops
 * scanning when it reaches one and does not consume the byte that caused the
 * transition. Any transition not listed below is zero-initialised and
 * therefore yields s_error.
 *
 * Within one row a later designator overrides an earlier one, so every row
 * is written "default first, exceptions after" -- the order of the entries
 * matters. The `a ... b` range designators are a GNU C extension.
 */
static const enum state {
	// terminal states:
	s_error,
	s_EOF,
	s_null,
	s_true,
	s_false,
	s_integer,

	// symbols:
	s_gravemark,
	s_oparen,
	s_cparen,
	s_identifier,

	// non-terminal states (s_start must stay the first of these):
	s_start,
	s_skipping_comment,

	s_reading_oparen,
	s_reading_cparen,
	s_reading_integer,
	s_reading_identifier,
	s_reading_gravemark,

	s_reading_n,
	s_reading_nu,
	s_reading_nul,
	s_reading_null,

	s_reading_t,
	s_reading_tr,
	s_reading_tru,
	s_reading_true,

	s_reading_f,
	s_reading_fa,
	s_reading_fal,
	s_reading_fals,
	s_reading_false,

	number_of_states,
} lookup[number_of_states][256] = {
	#define ANY 0 ... 255

	// Bytes that may continue an identifier. Shared by every state that is
	// part-way through one (including the keyword states), so that names
	// starting with a keyword prefix lex exactly like any other identifier.
	// NOTE(review): '?', '<' and '>' may start an identifier but do not
	// continue one -- confirm whether that is intended.
	#define IDENT_TAIL(st) \
		[st][':'] = s_reading_identifier, \
		[st]['='] = s_reading_identifier, \
		[st]['!'] = s_reading_identifier, \
		[st]['+'] = s_reading_identifier, \
		[st]['*'] = s_reading_identifier, \
		[st]['_'] = s_reading_identifier, \
		[st]['/'] = s_reading_identifier, \
		[st]['-'] = s_reading_identifier, \
		[st]['a' ... 'z'] = s_reading_identifier, \
		[st]['A' ... 'Z'] = s_reading_identifier, \
		[st]['0' ... '9'] = s_reading_identifier

	// A state part-way through a keyword: finishes as a plain identifier on
	// a non-identifier byte, degrades to a generic identifier on any other
	// identifier byte, and advances along the keyword on `ch`.
	#define KEYWORD_PREFIX(st, ch, next) \
		[st][ANY] = s_identifier, \
		IDENT_TAIL(st), \
		[st][ch] = next

	// A state that has read a whole keyword: finishes as `tok` unless more
	// identifier bytes follow.
	#define KEYWORD_DONE(st, tok) \
		[st][ANY] = tok, \
		IDENT_TAIL(st)

	// EOF:
	[s_start][0] = s_EOF,

	// skip whitespace
	[s_start][' '] = s_start,
	[s_start]['\t'] = s_start,
	[s_start]['\n'] = s_start,

	// skip comments:
	[s_start]['#'] = s_skipping_comment,
	[s_skipping_comment][ANY] = s_skipping_comment,
	[s_skipping_comment]['\n'] = s_start,
	// A comment on the last line need not end in '\n': without this exit the
	// machine would stay in s_skipping_comment forever once input runs out.
	[s_skipping_comment][0] = s_EOF,

	// brackets:
	[s_start]['('] = s_reading_oparen,
	[s_reading_oparen][ANY] = s_oparen,
	[s_start][')'] = s_reading_cparen,
	[s_reading_cparen][ANY] = s_cparen,

	// symbols:
	[s_start]['`'] = s_reading_gravemark,
	[s_reading_gravemark][ANY] = s_gravemark,

	// integer literals:
	[s_start]['0' ... '9'] = s_reading_integer,
	[s_reading_integer][ANY] = s_integer,
	[s_reading_integer]['0' ... '9'] = s_reading_integer,

	// identifiers
	[s_start]['?'] = s_reading_identifier,
	[s_start]['!'] = s_reading_identifier,
	[s_start]['>'] = s_reading_identifier,
	[s_start]['<'] = s_reading_identifier,
	[s_start]['='] = s_reading_identifier,
	[s_start]['+'] = s_reading_identifier,
	[s_start]['*'] = s_reading_identifier,
	[s_start]['_'] = s_reading_identifier,
	[s_start]['-'] = s_reading_identifier,
	[s_start]['/'] = s_reading_identifier,
	[s_start]['a' ... 'z'] = s_reading_identifier,
	[s_start]['A' ... 'Z'] = s_reading_identifier,
	[s_reading_identifier][ANY] = s_identifier,
	IDENT_TAIL(s_reading_identifier),

	// The keyword entries below must stay after the generic ['a' ... 'z']
	// start rule above, because they override it for 'n', 't' and 'f'.

	// "null" keyword:
	[s_start]['n'] = s_reading_n,
	KEYWORD_PREFIX(s_reading_n, 'u', s_reading_nu),
	KEYWORD_PREFIX(s_reading_nu, 'l', s_reading_nul),
	KEYWORD_PREFIX(s_reading_nul, 'l', s_reading_null),
	KEYWORD_DONE(s_reading_null, s_null),

	// "true" keyword:
	[s_start]['t'] = s_reading_t,
	KEYWORD_PREFIX(s_reading_t, 'r', s_reading_tr),
	KEYWORD_PREFIX(s_reading_tr, 'u', s_reading_tru),
	KEYWORD_PREFIX(s_reading_tru, 'e', s_reading_true),
	KEYWORD_DONE(s_reading_true, s_true),

	// "false" keyword:
	[s_start]['f'] = s_reading_f,
	KEYWORD_PREFIX(s_reading_f, 'a', s_reading_fa),
	KEYWORD_PREFIX(s_reading_fa, 'l', s_reading_fal),
	KEYWORD_PREFIX(s_reading_fal, 's', s_reading_fals),
	KEYWORD_PREFIX(s_reading_fals, 'e', s_reading_false),
	KEYWORD_DONE(s_reading_false, s_false),
};
#undef KEYWORD_DONE
#undef KEYWORD_PREFIX
#undef IDENT_TAIL
void tokenizer_next_token(
struct tokenizer* this)
{
ENTER;
this->rawtoken.n = 0;
void append(uint8_t c)
{
ENTER;
if (this->rawtoken.n == this->rawtoken.cap)
{
this->rawtoken.cap = this->rawtoken.cap << 1 ?: 1;
this->rawtoken.data = srealloc(
this->rawtoken.data,
sizeof(*this->rawtoken.data) * this->rawtoken.cap);
}
this->rawtoken.data[this->rawtoken.n++] = c;
EXIT;
}
enum state state = s_start;
while (state >= s_start)
{
dpvc(this->stream->c);
state = lookup[state][this->stream->c];
if (state > s_start)
{
append(this->stream->c);
istream_read(this->stream);
}
if (state == s_start)
{
this->rawtoken.n = 0;
istream_read(this->stream);
}
}
append(0), this->rawtoken.n--;
switch (state)
{
case s_error:
{
TODO;
break;
}
case s_EOF:
{
this->token = t_EOF;
break;
}
case s_gravemark:
{
this->token = t_gravemark;
break;
}
case s_oparen:
{
this->token = t_oparen;
break;
}
case s_cparen:
{
this->token = t_cparen;
break;
}
case s_null:
{
this->token = t_null;
break;
}
case s_true:
{
this->token = t_true;
break;
}
case s_false:
{
this->token = t_false;
break;
}
case s_identifier:
{
this->token = t_identifier;
break;
}
case s_integer:
{
this->token = t_integer;
break;
}
default:
TODO;
break;
}
EXIT;
}