tokenizing

This commit is contained in:
Ellen Arvidsson 2025-04-15 13:50:23 +02:00
parent dd099f3382
commit 10e16147ba
3 changed files with 360 additions and 124 deletions

View file

@ -55,4 +55,5 @@ set(MSPH_HEADER
)
add_executable(msph ${MSPH_HEADER} ${MSPH_SRC})
target_include_directories(spho PRIVATE ${INCLUDE_DIR})
target_include_directories(msph PRIVATE ${INCLUDE_DIR})
target_link_libraries(devcheck spho)

View file

@ -3,9 +3,11 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "spho/err.h"
#include "msph_token.h"
#define MSPH_ERR_SYS 0x0001
@ -14,6 +16,31 @@
#define MSPH_ERR(ctx, e) SPHO_ERR(ctx, e)
#define MSPH_TOKS_ERR(toks, e) MSPH_ERR((toks)->ctx, e)
/*
 * Per-token-type matcher state.
 *
 * read_single_tok() walks msph_matcher until it reaches the entry whose
 * type is TOK_END and calls tok_match() on each entry along the way.
 */
struct msph_token_matcher {
/* NOTE(review): not read or written by the code visible here -- confirm use */
size_t off;
/* set by tok_match(): number of source characters matched, 0 if no match */
size_t matchlen;
/* token type this entry matches; selects the case handled in tok_match() */
const int type;
} msph_matcher[] = {
{ 0, 0, TOK_START },
{ 0, 0, TOK_IDENT },
/* terminator entry: iteration over the table stops here */
{ 0, 0, TOK_END }
};
/* forward declarations of the file-local tokenizer helpers */
static ssize_t src_file_fill_buf(struct msph_ctx *,
struct msph_token_src_file *);
static int tok_match(struct msph_ctx *, struct msph_token_src *,
struct msph_token_matcher *);
static int tok_commit(struct msph_ctx *, struct msph_token_src *,
struct msph_token_matcher *, struct msph_token *);
static int char_at(struct msph_ctx *, struct msph_token_src *, size_t,
char *);
static int fromcbuf_charcpy(char *, const char *, size_t, size_t, size_t);
static int file_char_at(struct msph_ctx *, struct msph_token_src *, size_t,
char *out);
static int read_single_tok(struct msph_token *, struct msph_token_stream *);
struct msph_token_stream *
msph_token_stream_fopen(struct msph_ctx *ctx, const char *path)
{
@ -34,13 +61,12 @@ msph_token_stream_fopen(struct msph_ctx *ctx, const char *path)
ret->ctx = ctx;
ret->src.type = MSPH_TOKEN_SRC_FILE;
ret->src.inner.file.f = f;
ret->src.inner.file.eof = 0;
ret->src.inner.file.pos = ret->src.file.buf;
ret->src.inner.file.end = ret->src.file.buf;
ret->src.inner.file.read_ptr = ret->src.file.buf;
ret->src.inner.file.pos = 0;
ret->src.inner.file.end = 0;
res = strlcpy(ret->src.file.name, path, sizeof(ret->src.file.name));
if (res >= sizeof(ret->src.file.name)) {
res = strlcpy(ret->src.inner.file.name, path,
sizeof(ret->src.inner.file.name));
if (res >= sizeof(ret->src.inner.file.name)) {
MSPH_ERR(ctx, MSPH_ERR_TOOLONG);
goto err;
}
@ -67,10 +93,10 @@ msph_token_stream_frombuf(struct msph_ctx *ctx, const char *buf, size_t len)
}
ret->ctx = ctx;
ret->type = MSPH_TOKEN_SRC_STR;
ret->src.str.s = buf;
ret->src.str.len = len;
ret->src.str.pos = 0;
ret->src.type = MSPH_TOKEN_SRC_STR;
ret->src.inner.str.s = buf;
ret->src.inner.str.len = len;
ret->src.inner.str.pos = 0;
return (ret);
}
@ -83,9 +109,9 @@ msph_token_stream_close(struct msph_token_stream *s)
ret = -1;
switch (s->type) {
switch (s->src.type) {
case MSPH_TOKEN_SRC_FILE:
ret = fclose(s->src.file.f);
ret = fclose(s->src.inner.file.f);
break;
case MSPH_TOKEN_SRC_STR:
ret = 0;
@ -97,7 +123,6 @@ msph_token_stream_close(struct msph_token_stream *s)
return (ret);
}
/* -1 or num tokens read */
ssize_t
msph_token_stream_read_tok(struct msph_token *ptr, size_t n,
@ -120,102 +145,6 @@ msph_token_stream_read_tok(struct msph_token *ptr, size_t n,
return (ret);
}
struct msph_token_matcher {
size_t pos_off;
size_t matchlen;
const int tok_type;
} msph_matcher[] = {
{ 0, 0, TOK_START },
{ 0, 0, TOK_IDENT },
{ 0, 0, TOK_END }
};
#define BUF_LEN(b) (sizeof(b) / sizeof(b[0]))
static int
file_char_at(struct msph_ctx *ctx, struct msph_token_src *src, size_t i,
char *out)
{
size_t nread;
size_t maxread;
struct msph_token_src_file *file;
ret = -1;
file = &src->inner.file;
if (file->pos + i < file->end) {
*out = file->buf[file->pos + i];
return (0);
}
if (file->end < file->pos &&
((file->pos + i) % BUF_LEN(file->buf)) < file->end) {
*out = file->buf[(file->pos + i) % BUF_LEN(file->buf)];
return (0);
}
if (file->eof) {
return (-1);
}
if (file->end < file->pos)
maxread = file->pos - file->end;
else
maxread = BUF_LEN(file->buf) - file->end;
maxread = file->end < file->pos ? file->pos - file->end :
BUF_LEN(file->buf) - file->end;
if (maxread == 0) {
MSPH_ERR(ctx, MSPH_ERR_TOOLONG);
return (-1);
}
ret = fread(&file->buf[file->end], sizeof(file->buf[0]), maxread,
file->f);
if (ret == 0) {
if (ferror(file->f)) {
MSPH_ERR(ctx, MSPH_ERR_SYS);
return (-1);
}
file->eof = 1;
return (-1);
}
}
static int
char_at(struct msph_token_src *src, size_t i, char *out)
{
int ret;
switch (src->type) {
case MSPH_TOKEN_SRC_FILE:
ret = file_char_at(s, i, out);
break;
case MSPH_TOKEN_SRC_STR:
ret = str_char_at(s, i, out);
break;
default:
break;
}
return (ret);
}
static int
tok_match(struct msph_token_matcher *m, struct msph_token_stream *s)
{
}
static void
tok_commit(struct msph_token *ptr, struct msph_token_stream *s,
struct msph_matcher *m)
{
SPHO_PRECOND(p != NULL && m != NULL);
SPHO_PRECOND(m->matchlen != 0);
}
/* 1: success, 0: failed match, -1: error */
static int
read_single_tok(struct msph_token *ptr, struct msph_token_stream *s)
@ -223,10 +152,15 @@ read_single_tok(struct msph_token *ptr, struct msph_token_stream *s)
int res;
size_t m;
size_t max_m;
struct msph_ctx *ctx;
struct msph_token_src *src;
ctx = s->ctx;
src = &s->src;
max_m = 0;
for (m = 0; msph_matcher[m].type != TOK_END; m++) {
res = tok_match(&msph_matcher[m], s);
res = tok_match(ctx, src, &msph_matcher[m]);
if (res == -1)
return (-1);
@ -240,8 +174,306 @@ read_single_tok(struct msph_token *ptr, struct msph_token_stream *s)
if (max_m == 0)
return (0);
tok_commit(ptr, &msph_matcher[max_m]);
if (tok_commit(ctx, src, &msph_matcher[max_m], ptr) == -1)
return (-1);
return (1);
}
#define BUF_LEN(b) (sizeof(b) / sizeof((b)[0]))

/*
 * Reads from file->f into the free part of the circular buffer file->buf,
 * advancing file->end (modulo the buffer length) past the bytes read.
 *
 * One slot of the buffer is always left unused so that end == pos can only
 * mean "empty"; if the buffer were filled completely, a full buffer would
 * look exactly like an empty one and the next refill would overwrite
 * unread data.
 *
 * Returns the number of bytes read (0 when nothing more could be read),
 * or -1 with an error set on ctx: MSPH_ERR_TOOLONG when the buffer has no
 * free space on entry, MSPH_ERR_SYS when the stream reports an error.
 */
static ssize_t
src_file_fill_buf(struct msph_ctx *ctx, struct msph_token_src_file *file)
{
	const size_t cap = BUF_LEN(file->buf);
	ssize_t total;
	size_t nread, maxread;

	total = 0;

	for (;;) {
		/* contiguous free space starting at end, minus the spare slot */
		if (file->end < file->pos)
			maxread = file->pos - file->end - 1;
		else
			maxread = cap - file->end - (file->pos == 0 ? 1 : 0);

		if (maxread == 0) {
			/* full after reading something: done */
			if (total > 0)
				break;
			/* full on entry: caller needs more than fits */
			MSPH_ERR(ctx, MSPH_ERR_TOOLONG);
			return (-1);
		}

		nread = fread(&file->buf[file->end], sizeof(file->buf[0]),
		    maxread, file->f);

		total += nread;
		file->end = (file->end + nread) % cap;

		/* short read: either end of file or a stream error */
		if (nread < maxread) {
			if (ferror(file->f)) {
				MSPH_ERR(ctx, MSPH_ERR_SYS);
				return (-1);
			}
			break;
		}
	}

	return (total);
}
/* reads a single char from the circular buffer in src */
static int
file_char_at(struct msph_ctx *ctx, struct msph_token_src *src, size_t i,
char *out)
{
int ret;
ssize_t fill;
struct msph_token_src_file *file;
SPHO_PRECOND(s != NULL);
SPHO_PRECOND(s->src.type == MSPH_TOKEN_SRC_FILE);
ret = 0;
file = &src->inner.file;
do {
/* simplest case */
if (file->pos + i < file->end) {
*out = file->buf[file->pos + i];
return (1);
}
/* wrap around */
if (file->end < file->pos &&
((file->pos + i) % BUF_LEN(file->buf)) < file->end) {
*out = file->buf[(file->pos + i) % BUF_LEN(file->buf)];
return (1);
}
if (feof(file->f))
return (0);
if (src_file_fill_buf(ctx, file) == -1)
return (-1);
} while (ret++);
return (-1);
}
/*
 * Reads the character i positions past the current read position of src,
 * dispatching on the source type.
 *
 * Returns 1 and stores the character in *out on success, 0 when the
 * position lies past the end of the input, and -1 on error or when the
 * source type is not recognized.
 */
static int
char_at(struct msph_ctx *ctx, struct msph_token_src *src, size_t i, char *out)
{
	struct msph_token_src_str *str;

	switch (src->type) {
	case MSPH_TOKEN_SRC_FILE:
		return (file_char_at(ctx, src, i, out));
	case MSPH_TOKEN_SRC_STR:
		str = &src->inner.str;
		/* compare against the remaining length so pos + i cannot wrap */
		if (str->pos <= str->len && i < str->len - str->pos) {
			*out = str->s[str->pos + i];
			return (1);
		}
		return (0);
	default:
		/* unknown source type */
		return (-1);
	}
}
/*
 * Copies ncpy characters out of the circular buffer src (of src_len
 * elements), starting at index src_pos and wrapping around to index 0
 * when the end of the buffer is reached. dst must have room for ncpy
 * characters; no terminator is written.
 *
 * Returns 0 on success, -1 when ncpy exceeds the buffer length or src_pos
 * lies outside the buffer.
 */
static int
fromcbuf_charcpy(char *dst, const char *src, size_t src_len, size_t src_pos,
    size_t ncpy)
{
	size_t head, tail;

	if (ncpy > src_len || src_pos > src_len)
		return (-1);

	/* head: part before the wrap point; tail: part after wrapping */
	head = src_len - src_pos;
	if (head > ncpy)
		head = ncpy;
	tail = ncpy - head;

	memcpy(dst, src + src_pos, head * sizeof(src[0]));
	if (tail != 0)
		memcpy(dst + head, src, tail * sizeof(src[0]));

	return (0);
}
static int
tok_match(struct msph_ctx *ctx, struct msph_token_src *src,
struct msph_token_matcher *m)
{
int res;
int more;
char chr;
const char *match_str;
size_t off, len;
SPHO_PRECOND(m != NULL && s != NULL);
m->matchlen = 0;
#define MATCH_CHAR(c) \
do { \
if ((res = char_at(ctx, src, 0, &chr)) == -1) \
return (-1); \
else if (res == 0) \
return (0); \
\
if (chr == (c)) { \
m->matchlen = 1; \
} \
return (0); \
} while (0)
#define MATCH_STR(str) \
do { \
match_str = str; \
len = strlen(match_str); \
for (off = 0; off < len; off++) { \
if ((res = char_at(ctx, src, off, &chr)) == -1) \
return (-1); \
else if (res == 0) \
return (0); \
\
if (chr != match_str[off]) \
break; \
} \
if (off == len) \
m->matchlen = len; \
return (0); \
} while (0)
switch (m->type) {
case TOK_LBRACE:
MATCH_CHAR('{');
case TOK_RBRACE:
MATCH_CHAR('}');
case TOK_LBRAK:
MATCH_CHAR('[');
case TOK_RBRAK:
MATCH_CHAR(']');
case TOK_LPAREN:
MATCH_CHAR('(');
case TOK_RPAREN:
MATCH_CHAR(')');
case TOK_COLON:
MATCH_CHAR(':');
case TOK_EQUALS:
MATCH_CHAR('=');
case TOK_AMP:
MATCH_CHAR('&');
case TOK_PIPE:
MATCH_CHAR('|');
case TOK_RARROW:
MATCH_STR("=>");
case TOK_SUB:
MATCH_STR("<:");
case TOK_KW_TYPE:
MATCH_STR("type");
case TOK_KW_NOMINAL:
MATCH_STR("nominal");
case TOK_KW_MEMBER:
MATCH_STR("member");
case TOK_KW_CHECK:
MATCH_STR("check");
case TOK_KW_BOX:
MATCH_STR("box");
case TOK_KW_FORALL:
MATCH_STR("forall");
case TOK_CONST_TRUE:
MATCH_STR("True");
case TOK_CONST_FALSE:
MATCH_STR("False");
case TOK_IDENT:
off = 0;
while ((res = char_at(ctx, src, off++, &chr)) == 1) {
if (! isalnum(chr))
break;
m->matchlen++;
}
if (res == -1)
return (-1);
return (0);
default:
SPHO_ASSERT(0);
return (-1);
break;
}
#undef MATCH_CHAR
#undef MATCH_STR
}
#define TOK_HAS_DATA(type) ((type) == TOK_IDENT)

/*
 * Consumes the m->matchlen characters matched by m from src and fills in
 * the token *ptr: its type is set to m->type and, for token types that
 * carry data (identifiers), the matched text is copied NUL-terminated
 * into ptr->d.s.buf.
 *
 * The read position of src is advanced before the text is copied, so it
 * has already moved when -1 is returned for an over-long token.
 *
 * Returns 0 on success, -1 when the matched text does not fit in the
 * token buffer (MSPH_ERR_TOOLONG is set) or the source type is unknown.
 */
static int
tok_commit(struct msph_ctx *ctx, struct msph_token_src *src,
    struct msph_token_matcher *m, struct msph_token *ptr)
{
	size_t pos_old;
	struct msph_token_src_str *str;
	struct msph_token_src_file *file;

	SPHO_PRECOND(ptr != NULL && m != NULL);
	SPHO_PRECOND(m->matchlen != 0);

	switch (src->type) {
	case MSPH_TOKEN_SRC_FILE:
		file = &src->inner.file;
		pos_old = file->pos;

		file->pos = (file->pos + m->matchlen) % BUF_LEN(file->buf);

		ptr->type = m->type;
		if (! TOK_HAS_DATA(ptr->type))
			return (0);

		/* leave room for the terminating NUL */
		if (m->matchlen >= sizeof(ptr->d.s.buf)) {
			MSPH_ERR(ctx, MSPH_ERR_TOOLONG);
			return (-1);
		}

		/* the token text may wrap around the end of the buffer */
		if (fromcbuf_charcpy(ptr->d.s.buf, file->buf,
		    sizeof(file->buf), pos_old, m->matchlen) == -1) {
			MSPH_ERR(ctx, MSPH_ERR_TOOLONG);
			return (-1);
		}

		ptr->d.s.buf[m->matchlen] = '\0';
		return (0);

	case MSPH_TOKEN_SRC_STR:
		str = &src->inner.str;
		pos_old = str->pos;

		str->pos += m->matchlen;
		SPHO_ASSERT(str->pos <= str->len);

		ptr->type = m->type;
		if (! TOK_HAS_DATA(ptr->type))
			return (0);

		/* leave room for the terminating NUL */
		if (m->matchlen >= sizeof(ptr->d.s.buf)) {
			MSPH_ERR(ctx, MSPH_ERR_TOOLONG);
			return (-1);
		}

		/* copy from where the token started, not from the string start */
		memcpy(ptr->d.s.buf, &str->s[pos_old], m->matchlen *
		    sizeof(str->s[0]));
		ptr->d.s.buf[m->matchlen] = '\0';
		return (0);

	default:
		return (-1);
	}
}

View file

@ -92,13 +92,13 @@ enum msph_tok_type {
TOK_RBRAK, // ]
TOK_LPAREN, // (
TOK_RPAREN, // )
TOK_OP_COLON, // :
TOK_OP_EQUALS, // =
TOK_COLON, // :
TOK_EQUALS, // =
TOK_OP_AMP, // &
TOK_OP_PIPE, // |
TOK_OP_RARROW, // =>
TOK_OP_SUB, // <:
TOK_AMP, // &
TOK_PIPE, // |
TOK_RARROW, // =>
TOK_SUB, // <:
TOK_KW_TYPE, // type
TOK_KW_NOMINAL, // nominal
@ -135,12 +135,13 @@ struct msph_token {
struct msph_token_src_file {
FILE *f;
int eof;
size_t pos; // TODO rename bufpos
size_t end; // TODO rename bufend
size_t read_pos;
/* circular buffer for reading */
size_t pos;
size_t end;
char buf[MSPH_FILE_BUF_LEN];
/* file path */
char name[MSPH_PATH_LEN];
};
@ -157,7 +158,7 @@ union msph_token_src_data {
};
struct msph_token_src {
int type
int type;
union msph_token_src_data inner;
};
@ -174,6 +175,8 @@ struct msph_token_stream *msph_token_stream_frombuf(struct msph_ctx *,
int msph_token_stream_close(struct msph_token_stream*);
struct msph_token *msph_token_source_pop(struct msph_token_stream *);
ssize_t msph_token_stream_read_tok(struct msph_token *, size_t,
struct msph_token_stream *);
#endif /* _MSPH_EXPR_H */