lexing up and running

This commit is contained in:
Ellen Arvidsson 2025-04-15 20:02:25 +02:00
parent 10e16147ba
commit b9266cdf96
6 changed files with 434 additions and 102 deletions

652
src/msph/token.c Normal file
View file

@ -0,0 +1,652 @@
#include <sys/errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "msph/err.h"
#include "msph/token.h"
/*
 * State of one token matcher.
 *
 * tok_match() resets matchlen to 0 and then stores the number of characters
 * the token of kind `type` matches at the current source position;
 * tok_commit() consumes that many characters.
 */
struct msph_matcher {
/* initialised to 0; not read or written by the code in this file */
size_t off;
/* length of the most recent match, 0 when there was none */
size_t matchlen;
/* token kind (a TOK_* value) this matcher recognises */
const int type;
};
struct msph_matcher token_matcher[] = {
{ 0, 0, -1 },
{ 0, 0, TOK_LBRACE },
{ 0, 0, TOK_RBRACE },
{ 0, 0, TOK_LBRAK },
{ 0, 0, TOK_RBRAK },
{ 0, 0, TOK_LPAREN },
{ 0, 0, TOK_RPAREN },
{ 0, 0, TOK_COLON },
{ 0, 0, TOK_EQUALS },
{ 0, 0, TOK_AMP },
{ 0, 0, TOK_PIPE },
{ 0, 0, TOK_RARROW },
{ 0, 0, TOK_SUB },
{ 0, 0, TOK_KW_TYPE },
{ 0, 0, TOK_KW_NOMINAL },
{ 0, 0, TOK_KW_MEMBER },
{ 0, 0, TOK_KW_CHECK },
{ 0, 0, TOK_KW_BOX },
{ 0, 0, TOK_KW_FORALL },
{ 0, 0, TOK_CONST_TRUE },
{ 0, 0, TOK_CONST_FALSE },
{ 0, 0, TOK_IDENT },
{ 0, 0, TOK_END }
};
struct msph_matcher wspace = { 0, 0, TOK_WSPACE };
/*
 * Static description of a token kind: its TOK_* value, the name of that
 * constant as a string (dbg_str) and, for tokens with a fixed spelling, that
 * spelling (str, NULL otherwise).
 */
struct msph_token_info {
	const int type;
	const char *dbg_str;
	const char *str;
};

/* Table of all token kinds, terminated by the TOK_END entry. */
struct msph_token_info token_info[] = {
#define TOK_INFO(tok, s) { .type = tok, .dbg_str = #tok, .str = s }
	TOK_INFO(TOK_LBRACE, "{"),
	TOK_INFO(TOK_RBRACE, "}"),
	TOK_INFO(TOK_LBRAK, "["),
	TOK_INFO(TOK_RBRAK, "]"),
	TOK_INFO(TOK_LPAREN, "("),
	TOK_INFO(TOK_RPAREN, ")"),
	TOK_INFO(TOK_COLON, ":"),
	TOK_INFO(TOK_EQUALS, "="),
	TOK_INFO(TOK_AMP, "&"),
	TOK_INFO(TOK_PIPE, "|"),
	TOK_INFO(TOK_RARROW, "=>"),
	TOK_INFO(TOK_SUB, "<:"),
	TOK_INFO(TOK_KW_TYPE, "type"),
	TOK_INFO(TOK_KW_NOMINAL, "nominal"),
	TOK_INFO(TOK_KW_MEMBER, "member"),
	TOK_INFO(TOK_KW_CHECK, "check"),
	TOK_INFO(TOK_KW_BOX, "box"),
	TOK_INFO(TOK_KW_FORALL, "forall"),
	TOK_INFO(TOK_CONST_TRUE, "True"),
	TOK_INFO(TOK_CONST_FALSE, "False"),
	TOK_INFO(TOK_IDENT, NULL),
	TOK_INFO(TOK_WSPACE, NULL),
	{ .type = TOK_END, .dbg_str = NULL, .str = NULL }
#undef TOK_INFO
};
/*
 * Number of elements in the array b.  Only meaningful when b is a real array
 * (not a pointer), since it is computed with sizeof.
 */
#define BUF_LEN(b) ((sizeof(b) / sizeof((b)[0])))
/* Forward declarations of the file-local helpers defined further down. */

/* reads more input from the file into its circular buffer */
static ssize_t src_file_fill_buf(struct msph_ctx *,
struct msph_token_src_file *);
/* records in the matcher how many characters its token kind matches */
static int tok_match(struct msph_ctx *, struct msph_token_src *,
struct msph_matcher *);
/* consumes the matched characters and optionally fills in a token */
static int tok_commit(struct msph_ctx *, struct msph_token_src *,
struct msph_matcher *, struct msph_token *);
/* fetches the character at an offset from the current source position */
static int char_at(struct msph_ctx *, struct msph_token_src *, size_t,
char *);
/* copies characters out of a circular buffer */
static int fromcbuf_charcpy(char *, const char *, size_t, size_t, size_t);
/* char_at() implementation for file-backed sources */
static int file_char_at(struct msph_ctx *, struct msph_token_src *, size_t,
char *out);
/* reads one token from a stream */
static int read_single_tok(struct msph_token *, struct msph_token_stream *);
/* looks up the debug name of a token's kind */
static const char *tok_base_str(struct msph_token *);
/*
 * Put a context into its initial state: both the error code and the
 * additional error information are cleared to 0.
 */
void
msph_ctx_init(struct msph_ctx *ctx)
{
	ctx->err_info = 0;
	ctx->err = 0;
}
/*
 * Create a token stream reading from the already opened file f.
 *
 * name is copied into the stream; a name that does not fit is reported as
 * MSPH_ERR_TOOLONG.  Returns the new stream, or NULL with an error recorded
 * through MSPH_ERR.
 *
 * Note on ownership: when allocation or the name copy fails, f is closed
 * here (aborting if fclose() itself fails).  When the arguments are rejected
 * up front, f is left untouched.
 */
struct msph_token_stream *
msph_token_stream_file(struct msph_ctx *ctx, FILE *f, const char *name)
{
	struct msph_token_stream *stream;
	size_t copied;

	if (ctx == NULL || f == NULL || name == NULL) {
		MSPH_ERR(ctx, MSPH_ERR_INVAL);
		return (NULL);
	}

	stream = calloc(1, sizeof(struct msph_token_stream));
	if (stream == NULL) {
		MSPH_ERR(ctx, MSPH_ERR_SYS);
		goto err;
	}

	stream->ctx = ctx;
	stream->src.type = MSPH_TOKEN_SRC_FILE;
	stream->src.inner.file.f = f;
	stream->src.inner.file.pos = 0;
	stream->src.inner.file.end = 0;

	/* strlcpy() returns the length of name; >= size means truncation */
	copied = strlcpy(stream->src.inner.file.name, name,
	    sizeof(stream->src.inner.file.name));
	if (copied >= sizeof(stream->src.inner.file.name)) {
		MSPH_ERR(ctx, MSPH_ERR_TOOLONG);
		goto err;
	}

	return (stream);

err:
	if (fclose(f) == EOF)
		abort();
	/* free(NULL) is a no-op, so no guard is needed */
	free(stream);
	return (NULL);
}
/*
 * Create a token stream reading from the len characters starting at buf.
 *
 * The buffer is referenced, not copied.  Returns the new stream, or NULL
 * with MSPH_ERR_SYS recorded when allocation fails.
 */
struct msph_token_stream *
msph_token_stream_frombuf(struct msph_ctx *ctx, const char *buf, size_t len)
{
	struct msph_token_stream *stream;

	stream = calloc(1, sizeof(struct msph_token_stream));
	if (stream == NULL) {
		MSPH_ERR(ctx, MSPH_ERR_SYS);
		return (NULL);
	}

	stream->ctx = ctx;
	stream->src.type = MSPH_TOKEN_SRC_STR;
	stream->src.inner.str.pos = 0;
	stream->src.inner.str.len = len;
	stream->src.inner.str.s = buf;

	return (stream);
}
/*
 * Format a printable description of tok into buf (at most len bytes
 * including the terminating NUL).
 *
 * The description is the debug name of the token kind, followed by the
 * identifier text in parentheses for TOK_IDENT.  Like snprintf(), the return
 * value is the length the full description would have had; a value >= len
 * means the output was truncated.
 */
size_t
msph_token_str(char *buf, size_t len, struct msph_token *tok)
{
	const char *base;
	size_t ret;
	int n;

	/* tok_base_str() yields NULL for kinds it does not know about */
	if ((base = tok_base_str(tok)) == NULL)
		base = "<unknown token>";

	n = snprintf(buf, len, "%s", base);
	if (n < 0) {
		/* encoding error: hand back an empty string */
		if (len > 0)
			buf[0] = '\0';
		return (0);
	}
	ret = (size_t)n;

	if (ret >= len) {
		/* already truncated; only measure what would follow */
		buf = NULL;
		len = 0;
	} else {
		buf += ret;
		len -= ret;
	}

	switch (tok->type) {
	case TOK_IDENT:
		n = snprintf(buf, len, "(%s)", tok->d.s.buf);
		if (n > 0)
			ret += (size_t)n;
		break;
	default:
		break;
	}

	return (ret);
}
/* Size of the scratch buffer used to format one token for printing. */
#define MSPH_TOKEN_PRINT_BUF_LEN (2 * MSPH_TOKEN_BUF_LEN)

/*
 * Read tokens from s one at a time and write the description of each on its
 * own line to out.
 *
 * Returns the result of the last msph_token_stream_read_tok() call: 0 when
 * no further token could be read, -1 on error.
 */
int
msph_token_stream_print(struct msph_token_stream *s, FILE *out)
{
	ssize_t ret;
	struct msph_token tok;
	char tokstr[MSPH_TOKEN_PRINT_BUF_LEN];

	while ((ret = msph_token_stream_read_tok(&tok, 1, s)) > 0) {
		SPHO_DEBUG_PRINT("msph_token_stream_print: ret=%zd\n", ret);

		/* a result >= the buffer size means the text was truncated */
		if (msph_token_str(tokstr, BUF_LEN(tokstr), &tok) >=
		    BUF_LEN(tokstr)) {
			tokstr[BUF_LEN(tokstr) - 1] = '\0';
		}

		fprintf(out, "%s\n", tokstr);
	}

	SPHO_DEBUG_PRINT("msph_token_stream_print: ret=%zd\n", ret);

	return ((int)ret);
}
/*
 * Release the input behind a token stream.
 *
 * File-backed streams have their FILE closed and the fclose() result is
 * returned; string-backed streams need no action and yield 0.  An unknown
 * source type yields -1.  The stream object itself is not freed here.
 */
int
msph_token_stream_close(struct msph_token_stream *s)
{
	if (s->src.type == MSPH_TOKEN_SRC_FILE)
		return (fclose(s->src.inner.file.f));

	if (s->src.type == MSPH_TOKEN_SRC_STR)
		return (0);

	return (-1);
}
/*
 * Read up to n tokens from s into the array ptr.
 *
 * Stops early when no further token can be matched.  Returns the number of
 * tokens stored, or -1 as soon as reading any token fails.
 */
ssize_t
msph_token_stream_read_tok(struct msph_token *ptr, size_t n,
    struct msph_token_stream *s)
{
	size_t count;
	int res;

	for (count = 0; count < n; count++) {
		res = read_single_tok(&ptr[count], s);
		if (res == -1)
			return (-1);
		if (res == 0)
			break;
	}

	return ((ssize_t)count);
}
/* 1: matched token, 0: failed to match token, -1: error */
static int
read_single_tok(struct msph_token *ptr, struct msph_token_stream *s)
{
int res;
size_t m;
size_t max_m;
struct msph_ctx *ctx;
struct msph_token_src *src;
ctx = s->ctx;
src = &s->src;
/* Skipping whitespace */
if (tok_match(ctx, src, &wspace) == -1)
return (-1);
if (wspace.matchlen > 0 &&
tok_commit(ctx, src, &wspace, NULL) == -1)
return (-1);
max_m = 0;
for (m = 1; token_matcher[m].type != TOK_END; m++) {
res = tok_match(ctx, src, &token_matcher[m]);
SPHO_DEBUG_PRINT("read_single_tok: tok_match=%d\n", res);
if (res == -1)
return (-1);
if (res == 0 &&
token_matcher[m].matchlen > token_matcher[max_m].matchlen) {
max_m = m;
}
}
if (max_m == 0)
return (0);
SPHO_DEBUG_PRINT("read_single_tok: commit=%zu\n", max_m);
if (tok_commit(ctx, src, &token_matcher[max_m], ptr) == -1)
return (-1);
SPHO_DEBUG_PRINT("read_single_tok: committed\n");
return (1);
}
/*
 * Read as much input as currently fits from file->f into the circular
 * buffer file->buf, appending at file->end.
 *
 * The valid data is the region [pos, end) modulo the buffer size.  One slot
 * is always left unused so that pos == end unambiguously means "empty";
 * without it a completely filled buffer would look empty and the next fill
 * would overwrite unread characters.
 *
 * Returns the number of characters read (0 at end of file).  Returns -1 with
 * MSPH_ERR_TOOLONG when the buffer is already full on entry, or with
 * MSPH_ERR_SYS on a read error.
 */
static ssize_t
src_file_fill_buf(struct msph_ctx *ctx, struct msph_token_src_file *file)
{
	const size_t cap = BUF_LEN(file->buf);
	size_t nread, maxread;
	ssize_t ret;

	ret = 0;

	for (;;) {
		/* largest contiguous chunk that keeps one slot free */
		if (file->end < file->pos)
			maxread = file->pos - file->end - 1;
		else if (file->pos == 0)
			maxread = cap - file->end - 1;
		else
			maxread = cap - file->end;

		if (maxread == 0) {
			/* full: fine if we put something in, else too long */
			if (ret > 0)
				break;
			MSPH_ERR(ctx, MSPH_ERR_TOOLONG);
			return (-1);
		}

		nread = fread(&file->buf[file->end], sizeof(file->buf[0]),
		    maxread, file->f);

		ret += (ssize_t)nread;
		file->end = (file->end + nread) % cap;

		SPHO_DEBUG_PRINT("src_file_fill_buf: valid range (%zu, %zu)\n",
		    file->pos, file->end);

		if (nread < maxread) {
			/* short read: either an error or end of file */
			if (ferror(file->f)) {
				MSPH_ERR(ctx, MSPH_ERR_SYS);
				return (-1);
			}
			break;
		}
	}

	SPHO_DEBUG_PRINT("src_file_fill_buf: read %zd\n", ret);

	return (ret);
}
/* reads a single char from the circular buffer in src */
static int
file_char_at(struct msph_ctx *ctx, struct msph_token_src *src, size_t i,
char *out)
{
int ret;
ssize_t fill;
struct msph_token_src_file *file;
SPHO_PRECOND(src != NULL);
SPHO_PRECOND(src->type == MSPH_TOKEN_SRC_FILE);
ret = 0;
file = &src->inner.file;
fill = 0;
do {
SPHO_DEBUG_PRINT("want to read %zu, valid range (%zu, %zu)\n",
(file->pos + i) % BUF_LEN(file->buf), file->pos, file->end);
/* simplest case */
if (file->pos + i < file->end) {
*out = file->buf[file->pos + i];
return (1);
}
/* wrap around */
if (file->end < file->pos &&
((file->pos + i) % BUF_LEN(file->buf)) < file->end) {
*out = file->buf[(file->pos + i) % BUF_LEN(file->buf)];
return (1);
}
if (feof(file->f))
return (0);
if ((fill = src_file_fill_buf(ctx, file)) == -1)
return (-1);
} while (fill > 0 && ret++);
return (ret);
}
static int
char_at(struct msph_ctx *ctx, struct msph_token_src *src, size_t i, char *out)
{
int ret;
struct msph_token_src_str *str;
switch (src->type) {
case MSPH_TOKEN_SRC_FILE:
ret = file_char_at(ctx, src, i, out);
break;
case MSPH_TOKEN_SRC_STR:
str = &src->inner.str;
if (str->pos + i < str->len) {
*out = str->s[str->pos + i];
ret = 1;
} else {
ret = 0;
}
break;
default:
break;
}
SPHO_DEBUG_PRINT("char_at: ret=%d, *out=%c\n", ret, *out);
return (ret);
}
/*
 * Copy ncpy characters out of the circular buffer src (src_len elements)
 * into dst, starting at index src_pos and wrapping around to index 0 when
 * the end of the buffer is reached.
 *
 * dst must have room for ncpy characters; no terminating NUL is written.
 * Returns 0 on success, -1 when ncpy exceeds the buffer size or src_pos lies
 * outside the buffer.
 */
static int
fromcbuf_charcpy(char *dst, const char *src, size_t src_len, size_t src_pos,
    size_t ncpy)
{
	size_t cpy1, cpy2;

	if (src_len < ncpy || src_pos > src_len)
		return (-1);

	/* cpy1: part up to the end of the buffer, cpy2: wrapped remainder */
	cpy1 = src_len - src_pos;
	if (cpy1 > ncpy)
		cpy1 = ncpy;
	cpy2 = ncpy - cpy1;

	SPHO_DEBUG_PRINT("fromcbuf_charcpy: cpy1=%zu cpy2=%zu\n", cpy1, cpy2);

	memcpy(dst, &src[src_pos], cpy1 * sizeof(src[0]));

	/* the wrapped part continues after the first part in dst */
	if (cpy2 > 0)
		memcpy(dst + cpy1, &src[0], cpy2 * sizeof(src[0]));

	return (0);
}
/*
 * Match the single character c at the current position of src.
 * Sets m->matchlen to 1 on a match.  Returns 0, or -1 on a read error.
 */
static int
tok_match_char(struct msph_ctx *ctx, struct msph_token_src *src,
    struct msph_matcher *m, char c)
{
	int res;
	char chr;

	SPHO_DEBUG_PRINT("tok_match: '%c'\n", c);
	if ((res = char_at(ctx, src, 0, &chr)) == -1)
		return (-1);
	else if (res == 0)
		return (0);
	SPHO_DEBUG_PRINT("tok_match: char_at(0)='%c'\n", chr);

	if (chr == c)
		m->matchlen = 1;
	SPHO_DEBUG_PRINT("tok_match: matchlen=%zu\n", m->matchlen);

	return (0);
}

/*
 * Match the fixed string str at the current position of src.
 * Sets m->matchlen to strlen(str) when all of it is present.
 * Returns 0, or -1 on a read error.
 */
static int
tok_match_str(struct msph_ctx *ctx, struct msph_token_src *src,
    struct msph_matcher *m, const char *str)
{
	int res;
	char chr;
	size_t off, len;

	len = strlen(str);
	for (off = 0; off < len; off++) {
		if ((res = char_at(ctx, src, off, &chr)) == -1)
			return (-1);
		else if (res == 0)
			return (0);

		if (chr != str[off])
			return (0);
	}
	m->matchlen = len;

	return (0);
}

/*
 * Match the longest run of characters accepted by the <ctype.h> style
 * predicate is_member.  Sets m->matchlen to the length of the run.
 * Returns 0, or -1 on a read error.
 */
static int
tok_match_run(struct msph_ctx *ctx, struct msph_token_src *src,
    struct msph_matcher *m, int (*is_member)(int))
{
	int res;
	char chr;
	size_t off;

	off = 0;
	while ((res = char_at(ctx, src, off++, &chr)) == 1) {
		/*
		 * <ctype.h> functions need a value representable as
		 * unsigned char; a negative plain char is undefined.
		 */
		if (! is_member((unsigned char)chr))
			break;
		m->matchlen++;
	}
	if (res == -1)
		return (-1);

	return (0);
}

/*
 * Determine how many characters at the current position of src form a token
 * of kind m->type and store that count in m->matchlen (0 for no match).
 * Nothing is consumed; use tok_commit() for that.
 *
 * Identifiers are runs of alphanumeric characters (a leading digit is
 * accepted); whitespace is a run of isspace() characters.
 *
 * Returns 0 on success, whether or not anything matched, and -1 on error.
 */
static int
tok_match(struct msph_ctx *ctx, struct msph_token_src *src,
    struct msph_matcher *m)
{
	SPHO_PRECOND(m != NULL && src != NULL);

	m->matchlen = 0;

	switch (m->type) {
	case TOK_LBRACE:
		return (tok_match_char(ctx, src, m, '{'));
	case TOK_RBRACE:
		return (tok_match_char(ctx, src, m, '}'));
	case TOK_LBRAK:
		return (tok_match_char(ctx, src, m, '['));
	case TOK_RBRAK:
		return (tok_match_char(ctx, src, m, ']'));
	case TOK_LPAREN:
		return (tok_match_char(ctx, src, m, '('));
	case TOK_RPAREN:
		return (tok_match_char(ctx, src, m, ')'));
	case TOK_COLON:
		return (tok_match_char(ctx, src, m, ':'));
	case TOK_EQUALS:
		return (tok_match_char(ctx, src, m, '='));
	case TOK_AMP:
		return (tok_match_char(ctx, src, m, '&'));
	case TOK_PIPE:
		return (tok_match_char(ctx, src, m, '|'));
	case TOK_RARROW:
		return (tok_match_str(ctx, src, m, "=>"));
	case TOK_SUB:
		return (tok_match_str(ctx, src, m, "<:"));
	case TOK_KW_TYPE:
		return (tok_match_str(ctx, src, m, "type"));
	case TOK_KW_NOMINAL:
		return (tok_match_str(ctx, src, m, "nominal"));
	case TOK_KW_MEMBER:
		return (tok_match_str(ctx, src, m, "member"));
	case TOK_KW_CHECK:
		return (tok_match_str(ctx, src, m, "check"));
	case TOK_KW_BOX:
		return (tok_match_str(ctx, src, m, "box"));
	case TOK_KW_FORALL:
		return (tok_match_str(ctx, src, m, "forall"));
	case TOK_CONST_TRUE:
		return (tok_match_str(ctx, src, m, "True"));
	case TOK_CONST_FALSE:
		return (tok_match_str(ctx, src, m, "False"));
	case TOK_IDENT:
		return (tok_match_run(ctx, src, m, isalnum));
	case TOK_WSPACE:
		return (tok_match_run(ctx, src, m, isspace));
	default:
		SPHO_ASSERT(0);
		return (-1);
	}
}
/* True for token kinds that carry text in addition to their type. */
#define TOK_HAS_DATA(type) ((type) == TOK_IDENT)

/*
 * Consume the m->matchlen characters matched by m from src and, when ptr is
 * not NULL, fill in *ptr with the token: its type and, for kinds with data,
 * a NUL-terminated copy of the matched text.
 *
 * The read position is advanced before the text is copied, so it has already
 * moved when MSPH_ERR_TOOLONG is reported for text that does not fit in the
 * token's buffer.
 *
 * Returns 0 on success and -1 on error.
 */
static int
tok_commit(struct msph_ctx *ctx, struct msph_token_src *src,
    struct msph_matcher *m, struct msph_token *ptr)
{
	size_t pos_old;
	struct msph_token_src_str *str;
	struct msph_token_src_file *file;

	SPHO_PRECOND(ctx != NULL && src != NULL && m != NULL);
	SPHO_PRECOND(m->matchlen != 0);
	SPHO_DEBUG_PRINT("committing\n");

	switch (src->type) {
	case MSPH_TOKEN_SRC_FILE:
		file = &src->inner.file;
		pos_old = file->pos;

		SPHO_DEBUG_PRINT("committing\n");
		/* the read position lives in a circular buffer */
		file->pos += m->matchlen;
		file->pos %= BUF_LEN(file->buf);
		SPHO_ASSERT(file->pos < BUF_LEN(file->buf) ||
		    file->pos < pos_old);
		SPHO_DEBUG_PRINT("committing\n");

		if (ptr == NULL)
			return (0);

		ptr->type = m->type;
		if (! TOK_HAS_DATA(ptr->type))
			return (0);
		SPHO_DEBUG_PRINT("committing\n");

		/* need room for the text plus the terminating NUL */
		if (m->matchlen >= sizeof(ptr->d.s.buf)) {
			MSPH_ERR(ctx, MSPH_ERR_TOOLONG);
			return (-1);
		}
		SPHO_DEBUG_PRINT("committing\n");

		if (fromcbuf_charcpy(ptr->d.s.buf, file->buf,
		    sizeof(file->buf), pos_old, m->matchlen) == -1) {
			MSPH_ERR(ctx, MSPH_ERR_TOOLONG);
			return (-1);
		}
		SPHO_DEBUG_PRINT("committing\n");

		ptr->d.s.buf[m->matchlen] = '\0';
		return (0);

	case MSPH_TOKEN_SRC_STR:
		str = &src->inner.str;
		pos_old = str->pos;

		str->pos += m->matchlen;
		SPHO_ASSERT(str->pos <= str->len);

		if (ptr == NULL)
			return (0);

		ptr->type = m->type;
		if (! TOK_HAS_DATA(ptr->type))
			return (0);

		/* need room for the text plus the terminating NUL */
		if (m->matchlen >= sizeof(ptr->d.s.buf)) {
			MSPH_ERR(ctx, MSPH_ERR_TOOLONG);
			return (-1);
		}

		/* the token text starts where the read position used to be */
		memcpy(ptr->d.s.buf, &str->s[pos_old], m->matchlen *
		    sizeof(str->s[0]));
		ptr->d.s.buf[m->matchlen] = '\0';
		return (0);

	default:
		return (-1);
	}
}
static const char *
tok_base_str(struct msph_token *tok)
{
size_t i;
for (i = 0; token_info[i].type != TOK_END; i++) {
if (token_info[i].type == tok->type)
return (token_info[i].dbg_str);
}
return (NULL);
}