log-e-sappho/src/msph/token.c

778 lines
15 KiB
C

#include <sys/errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "msph/err.h"
#include "msph/token.h"
/*
 * Matcher state for one token type. tok_match() stores the outcome of the
 * most recent match attempt in matchlen.
 */
struct msph_matcher {
/* NOTE(review): off is neither read nor written by the code visible here. */
size_t off;
/* Characters matched by the last tok_match() call; 0 means no match. */
size_t matchlen;
/* TOK_* value this matcher recognizes (tok_match() switches on it). */
const int type;
};
/*
 * One matcher per token type, tried in array order by read_single_tok().
 *
 * Slot 0 (type -1) is a placeholder: read_single_tok() starts scanning at
 * index 1 and uses index 0, whose matchlen stays 0, as the "nothing matched"
 * baseline. The TOK_END entry terminates the scan.
 *
 * Order matters: a later entry only replaces the current best when its match
 * is strictly longer, so on equal length the earlier entry wins (keywords and
 * constants are listed before TOK_IDENT).
 */
struct msph_matcher token_matchers[] = {
{ 0, 0, -1 },
{ 0, 0, TOK_LBRACE },
{ 0, 0, TOK_RBRACE },
{ 0, 0, TOK_LBRAK },
{ 0, 0, TOK_RBRAK },
{ 0, 0, TOK_LPAREN },
{ 0, 0, TOK_RPAREN },
{ 0, 0, TOK_COLON },
{ 0, 0, TOK_EQUALS },
{ 0, 0, TOK_COMMA },
{ 0, 0, TOK_DOT },
{ 0, 0, TOK_AMP },
{ 0, 0, TOK_PIPE },
{ 0, 0, TOK_IMPL },
{ 0, 0, TOK_RARROW },
{ 0, 0, TOK_SUB },
{ 0, 0, TOK_KW_TYPE },
{ 0, 0, TOK_KW_NOMINAL },
{ 0, 0, TOK_KW_MEMBER },
{ 0, 0, TOK_KW_ASSERT },
{ 0, 0, TOK_KW_BOX },
{ 0, 0, TOK_KW_FORALL },
{ 0, 0, TOK_CONST_TRUE },
{ 0, 0, TOK_CONST_FALSE },
{ 0, 0, TOK_IDENT },
{ 0, 0, TOK_END }
};
/*
 * Matcher for runs of whitespace. read_single_tok() applies it before the
 * token matchers and commits the match without producing a token.
 */
struct msph_matcher wspace = { 0, 0, TOK_WSPACE };
/*
 * Static description of every token type, terminated by a TOK_END entry.
 *
 * type     TOK_* value
 * dbg_str  the TOK_* identifier as a string (generated by #tok); this is the
 *          text returned by tok_base_str()
 * str      literal spelling of the token, NULL for TOK_IDENT
 *
 * NOTE(review): str is not consulted by the code visible here; tok_match()
 * carries its own copies of the literals, so the two must be kept in sync.
 */
struct msph_token_info {
const int type;
const char *dbg_str;
const char *str;
} token_info[] = {
#define TOK_INFO(tok, s) { tok , #tok, s }
TOK_INFO(TOK_LBRACE, "{"),
TOK_INFO(TOK_RBRACE, "}"),
TOK_INFO(TOK_LBRAK, "["),
TOK_INFO(TOK_RBRAK, "]"),
TOK_INFO(TOK_LPAREN, "("),
TOK_INFO(TOK_RPAREN, ")"),
TOK_INFO(TOK_COLON, ":"),
TOK_INFO(TOK_EQUALS, "="),
TOK_INFO(TOK_COMMA, ","),
TOK_INFO(TOK_DOT, "."),
TOK_INFO(TOK_AMP, "&"),
TOK_INFO(TOK_PIPE, "|"),
TOK_INFO(TOK_IMPL, "=>"),
TOK_INFO(TOK_RARROW, "->"),
TOK_INFO(TOK_SUB, "<:"),
TOK_INFO(TOK_KW_TYPE, "type"),
TOK_INFO(TOK_KW_NOMINAL, "nominal"),
TOK_INFO(TOK_KW_MEMBER, "member"),
TOK_INFO(TOK_KW_ASSERT, "assert"),
TOK_INFO(TOK_KW_BOX, "box"),
TOK_INFO(TOK_KW_FORALL, "forall"),
TOK_INFO(TOK_CONST_TRUE, "True"),
TOK_INFO(TOK_CONST_FALSE, "False"),
TOK_INFO(TOK_IDENT, NULL),
/* Terminator: loops over token_info stop at type == TOK_END. */
{ TOK_END , NULL, NULL }
#undef TOK_INFO
};
/*
 * Number of elements in the array b. Only meaningful for true arrays: for a
 * pointer it would divide the pointer size by the element size.
 */
#define BUF_LEN(b) ((sizeof(b) / sizeof((b)[0])))
/* Forward declarations of the file-local helpers defined further down. */
static ssize_t src_file_fill_buf(struct msph_ctx *,
struct msph_token_src_file *);
static int tok_match(struct msph_ctx *, struct msph_token_src *,
struct msph_matcher *);
static int tok_commit(struct msph_ctx *, struct msph_token_src *,
struct msph_matcher *, struct msph_token *);
static void tok_update_pos(struct msph_ctx *, struct msph_token_src *,
struct msph_matcher *m);
static int char_at(struct msph_ctx *, struct msph_token_src *, size_t,
char *);
static int fromcbuf_charcpy(char *, const char *, size_t, size_t, size_t);
static int file_char_at(struct msph_ctx *, struct msph_token_src *, size_t,
char *out);
static int read_single_tok(struct msph_token *, struct msph_token_stream *);
static const char *tok_base_str(struct msph_token *);
/*
 * Create a token stream that reads from the already opened file f.
 *
 * ctx   context used for error reporting; stored in the stream
 * name  stream name, copied into the stream's fixed-size name buffer
 * f     input file the stream reads from
 *
 * Returns the new stream, or NULL on error.
 *
 * Ownership of f on failure: invalid arguments (NULL ctx, name or f) are
 * reported as MSPH_ERR_INVAL and leave f untouched; on allocation failure
 * (MSPH_ERR_SYS) or a name that does not fit (MSPH_ERR_TOOLONG) f is closed
 * before returning, and the process aborts if that fclose() fails.
 */
struct msph_token_stream *
msph_token_stream_file(struct msph_ctx *ctx, const char *name, FILE *f)
{
	struct msph_token_stream *ret;

	/* name is handed to strlcpy() below, so it must not be NULL either. */
	if (ctx == NULL || name == NULL || f == NULL) {
		MSPH_ERR(ctx, MSPH_ERR_INVAL);
		return (NULL);
	}

	if ((ret = calloc(1, sizeof(*ret))) == NULL) {
		MSPH_ERR(ctx, MSPH_ERR_SYS);
		goto err;
	}

	ret->ctx = ctx;

	/* strlcpy() returns strlen(name); >= buffer size means truncation. */
	if (strlcpy(ret->name, name, BUF_LEN(ret->name)) >=
	    BUF_LEN(ret->name)) {
		MSPH_ERR(ctx, MSPH_ERR_TOOLONG);
		goto err;
	}

	ret->src.type = MSPH_TOKEN_SRC_FILE;
	ret->src.pos = (struct msph_text_pos) { .line = 1, .col = 1 };
	ret->src.inner.file.f = f;
	ret->src.inner.file.pos = 0;
	ret->src.inner.file.end = 0;

	return (ret);

err:
	if (fclose(f) == EOF)
		abort();

	/* free(NULL) is a no-op, so no guard is needed. */
	free(ret);

	return (NULL);
}
struct msph_token_stream *
msph_token_stream_frombuf(struct msph_ctx *ctx, const char *name,
const char *buf, size_t len)
{
size_t res;
struct msph_token_stream *ret;
if ((ret = calloc(1, sizeof(struct msph_token_stream))) == NULL) {
MSPH_ERR(ctx, MSPH_ERR_SYS);
return (NULL);
}
ret->ctx = ctx;
if ((res = strlcpy(ret->name, name, BUF_LEN(ret->name)))
>= BUF_LEN(ret->name)) {
MSPH_ERR(ctx, MSPH_ERR_TOOLONG);
goto err;
}
ret->src.type = MSPH_TOKEN_SRC_STR;
ret->src.pos = (struct msph_text_pos) { .line = 1, .col = 1 };
ret->src.inner.str.s = buf;
ret->src.inner.str.len = strnlen(buf, len);
ret->src.inner.str.pos = 0;
return (ret);
err:
free(ret);
return (NULL);
}
/*
 * Format a printable representation of tok into buf (at most len bytes,
 * including the terminating NUL).
 *
 * The text is the token's TOK_* name; identifier tokens additionally get
 * their text appended in parentheses, e.g. "TOK_IDENT(foo)".
 *
 * Returns the length the complete text has, using snprintf() semantics: a
 * value >= len means the output was truncated. When the base name alone is
 * already truncated, the returned length covers the base name only.
 * Returns a negative value when tok has an unknown type or formatting fails.
 */
ssize_t
msph_token_str(char *buf, size_t len,
    struct msph_token *tok)
{
	int base_len, data_len;
	const char *base;

	if ((base = tok_base_str(tok)) == NULL)
		return (-1);

	base_len = snprintf(buf, len, "%s", base);
	if (base_len < 0 || (size_t)base_len >= len)
		return ((ssize_t)base_len);

	/* Only identifiers carry extra data to print. */
	if (tok->type != TOK_IDENT)
		return ((ssize_t)base_len);

	data_len = snprintf(buf + base_len, len - (size_t)base_len, "(%s)",
	    tok->data.str);
	/* Do not fold a formatting error into the running length. */
	if (data_len < 0)
		return (-1);

	return ((ssize_t)base_len + (ssize_t)data_len);
}
/* Parenthesized so the expansion stays one operand inside any expression. */
#define MSPH_TOKEN_PRINT_BUF_LEN (2 * MSPH_IDENT_LEN)

/*
 * Read tokens from s until the stream ends and write one line per token to
 * out, using the representation produced by msph_token_str(). A token whose
 * text does not fit the local buffer is printed cut off, followed by a
 * truncation marker.
 *
 * Returns 0 when the stream was read to its end, and a negative value when
 * reading a token failed or a token could not be formatted (the latter is
 * also reported as MSPH_ERR_TOKEN_INVAL).
 */
int
msph_token_stream_print(struct msph_token_stream *s, FILE *out)
{
	ssize_t ret;
	struct msph_token tok;
	char tokstr[MSPH_TOKEN_PRINT_BUF_LEN];

	while ((ret = msph_token_stream_read(&tok, 1, s)) > 0) {
		ret = msph_token_str(tokstr, BUF_LEN(tokstr), &tok);
		if (ret < 0) {
			MSPH_ERR_INFO(s->ctx, MSPH_ERR_TOKEN_INVAL, tok.type);
			break;
		}

		if ((size_t)ret < BUF_LEN(tokstr))
			fprintf(out, "%s\n", tokstr);
		else
			/* End the line so the next token starts on its own. */
			fprintf(out, "%s...(truncated)\n", tokstr);
	}

	return ((int)ret);
}
/*
 * Release the input behind s.
 *
 * A file-backed stream closes its FILE and returns fclose()'s result; a
 * string-backed stream has nothing to release and returns 0. Any other
 * source type yields -1. The stream object itself is not freed here.
 */
int
msph_token_stream_close(struct msph_token_stream *s)
{
	if (s->src.type == MSPH_TOKEN_SRC_FILE)
		return (fclose(s->src.inner.file.f));

	if (s->src.type == MSPH_TOKEN_SRC_STR)
		return (0);

	return (-1);
}
/*
 * Read up to n tokens from s into the array ptr.
 *
 * Stops early when no further token can be read. Returns the number of
 * tokens stored, or -1 as soon as reading a token fails.
 */
ssize_t
msph_token_stream_read(struct msph_token *ptr, size_t n,
    struct msph_token_stream *s)
{
	size_t count;
	int status;

	for (count = 0; count < n; count++) {
		status = read_single_tok(&ptr[count], s);
		if (status == 0)
			break;
		if (status == -1)
			return (-1);
	}

	return ((ssize_t)count);
}
/*
 * Read the next token from s into ptr.
 *
 * Leading whitespace is consumed first. Every matcher in token_matchers is
 * then tried at the current position and the longest match wins; on equal
 * length the matcher listed first is kept.
 *
 * Returns 1 when a token was read, 0 when nothing matched and the stream
 * reports end of input, and -1 on error (including input that matches no
 * token, reported as MSPH_ERR_TOKEN_NOMATCH).
 */
static int
read_single_tok(struct msph_token *ptr, struct msph_token_stream *s)
{
	struct msph_token_src *src = &s->src;
	struct msph_ctx *ctx = s->ctx;
	size_t best, cur;
	int status;

	/* Skipping whitespace */
	if (tok_match(ctx, src, &wspace) == -1)
		return (-1);
	SPHO_DEBUG_PRINT("wspace.matchlen=%zu\n", wspace.matchlen);
	if (wspace.matchlen > 0) {
		if (tok_commit(ctx, src, &wspace, NULL) == -1)
			return (-1);
	}

	/* Index 0 is the placeholder matcher: best == 0 means "no match". */
	best = 0;
	cur = 1;
	while (token_matchers[cur].type != TOK_END) {
		status = tok_match(ctx, src, &token_matchers[cur]);
		if (status == -1)
			return (-1);
		if (status == 0 && token_matchers[cur].matchlen >
		    token_matchers[best].matchlen)
			best = cur;
		cur++;
	}

	if (best != 0) {
		if (tok_commit(ctx, src, &token_matchers[best], ptr) == -1)
			return (-1);
		return (1);
	}

	if (msph_token_stream_eof(s))
		return (0);

	MSPH_ERR(s->ctx, MSPH_ERR_TOKEN_NOMATCH);
	return (-1);
}
/*
 * Tell whether s has no more input.
 *
 * File source: the buffer's read and write indices coincide and the FILE
 * has its end-of-file indicator set. String source: the read position has
 * reached the string length.
 *
 * Returns 1 at end of input, 0 otherwise, and -1 (after reporting
 * MSPH_ERR_INVAL) for an unknown source type.
 */
int
msph_token_stream_eof(struct msph_token_stream *s)
{
	if (s->src.type == MSPH_TOKEN_SRC_FILE) {
		struct msph_token_src_file *file = &s->src.inner.file;

		if (file->pos != file->end)
			return (0);
		return (feof(file->f) ? 1 : 0);
	}

	if (s->src.type == MSPH_TOKEN_SRC_STR) {
		struct msph_token_src_str *str = &s->src.inner.str;

		return (str->pos == str->len ? 1 : 0);
	}

	MSPH_ERR(s->ctx, MSPH_ERR_INVAL);
	return (-1);
}
/*
 * Allocate a byte-for-byte copy of token.
 *
 * The token type must be listed in token_info; otherwise
 * MSPH_ERR_TOKEN_INVAL is reported. Allocation failure is reported as
 * MSPH_ERR_SYS. Returns the malloc()ed copy, or NULL on either error.
 */
struct msph_token *
msph_token_copy(struct msph_ctx *ctx, struct msph_token *token)
{
	struct msph_token_info *info;
	struct msph_token *copy;
	size_t idx;

	/* Validate the type against the table of known tokens. */
	info = NULL;
	idx = 0;
	while (info == NULL && token_info[idx].type != TOK_END) {
		if (token_info[idx].type == token->type)
			info = &token_info[idx];
		idx++;
	}

	if (info == NULL) {
		MSPH_ERR_INFO(ctx, MSPH_ERR_TOKEN_INVAL, token->type);
		return (NULL);
	}

	copy = malloc(sizeof(*copy));
	if (copy == NULL) {
		MSPH_ERR(ctx, MSPH_ERR_SYS);
		return (NULL);
	}

	memcpy(copy, token, sizeof(*copy));
	return (copy);
}
/*
 * Read more data from file->f into the circular buffer file->buf.
 *
 * New bytes are stored starting at index file->end, which is advanced
 * modulo the buffer length; file->pos is only read here. Reading continues
 * until end catches up with pos or fread() delivers fewer bytes than
 * requested.
 *
 * Returns the number of bytes added, or -1 on error (MSPH_ERR_SYS when the
 * stream's error indicator is set, MSPH_ERR_TOKEN_TOOLONG when there is no
 * room to read into).
 *
 * NOTE(review): pos == end is the state the readers in this file treat as
 * "no buffered data", but it is also the state reached here once the buffer
 * has been filled completely -- confirm how a completely full buffer is
 * meant to be told apart from an empty one.
 */
static ssize_t
src_file_fill_buf(struct msph_ctx *ctx, struct msph_token_src_file *file)
{
ssize_t ret;
size_t nread, maxread;
ret = nread = maxread = 0;
do {
/*
 * Largest contiguous writable run: up to pos when the write index
 * is behind the read index, otherwise up to the end of the array.
 */
if (file->end < file->pos)
maxread = file->pos - file->end;
else
maxread = BUF_LEN(file->buf) - file->end;
/*
 * NOTE(review): with end always reduced modulo the buffer length
 * both branches above appear to yield at least 1, so this check
 * looks unreachable -- confirm.
 */
if (maxread == 0) {
MSPH_ERR(ctx, MSPH_ERR_TOKEN_TOOLONG);
return (-1);
}
nread = fread(&file->buf[file->end], sizeof(file->buf[0]),
maxread, file->f);
ret += nread;
file->end = (file->end + nread) % BUF_LEN(file->buf);
SPHO_DEBUG_PRINT("src_file_fill_buf: valid range (%zu, %zu)\n",
file->pos, file->end);
/* Short read: either an I/O error or the end of the file. */
if (nread < maxread) {
if (ferror(file->f)) {
MSPH_ERR(ctx, MSPH_ERR_SYS);
return (-1);
}
break;
}
} while (file->end != file->pos);
return (ret);
}
/*
 * Fetch the character i positions past the current read position of a
 * file-backed source, reading more of the file into the circular buffer
 * when needed. The read position itself is not moved.
 *
 * The buffered data is [pos, end) when pos <= end, and [pos, buffer length)
 * followed by [0, end) when the buffer has wrapped (end < pos). Both parts
 * of a wrapped buffer are served; an index beyond the buffered data is
 * never mapped back into the buffer by the modulo.
 *
 * Returns 1 and stores the character in *out on success, 0 when the input
 * ends before that position (or no further data could be read), and -1 when
 * refilling the buffer fails.
 *
 * NOTE(review): pos == end is treated as "nothing buffered" here, as
 * before; see src_file_fill_buf() for how that state can also arise from a
 * completely filled buffer.
 */
static int
file_char_at(struct msph_ctx *ctx, struct msph_token_src *src, size_t i,
    char *out)
{
	ssize_t fill;
	size_t buflen, avail;
	struct msph_token_src_file *file;

	SPHO_PRECOND(src != NULL);
	SPHO_PRECOND(src->type == MSPH_TOKEN_SRC_FILE);

	file = &src->inner.file;
	buflen = BUF_LEN(file->buf);

	do {
		/* Number of buffered characters starting at pos. */
		if (file->pos <= file->end)
			avail = file->end - file->pos;
		else
			avail = buflen - file->pos + file->end;

		if (i < avail) {
			*out = file->buf[(file->pos + i) % buflen];
			return (1);
		}

		if (feof(file->f))
			return (0);

		if ((fill = src_file_fill_buf(ctx, file)) == -1)
			return (-1);
	} while (fill > 0);

	return (0);
}
/*
 * Fetch the character i positions past the current read position of src
 * without consuming it.
 *
 * Returns 1 and stores the character in *out on success, 0 when the input
 * ends before that position, and -1 on error or for an unknown source type.
 * *out is only written when 1 is returned.
 */
static int
char_at(struct msph_ctx *ctx, struct msph_token_src *src, size_t i, char *out)
{
	int ret;
	struct msph_token_src_str *str;

	ret = -1;

	switch (src->type) {
	case MSPH_TOKEN_SRC_FILE:
		ret = file_char_at(ctx, src, i, out);
		break;
	case MSPH_TOKEN_SRC_STR:
		str = &src->inner.str;
		if (str->pos + i < str->len) {
			*out = str->s[str->pos + i];
			ret = 1;
		} else {
			ret = 0;
		}
		break;
	default:
		break;
	}

#ifdef SPHO_ENABLE_DEBUG_PRINT
	if (ret != 1) {
		/* *out was not written, so there is no character to show. */
		SPHO_DEBUG_PRINT("char_at: ret=%d\n", ret);
	} else if (isspace((unsigned char)*out)) {
		/* <ctype.h> needs an unsigned char value, hence the cast. */
		const char *charrep;

		switch (*out) {
		case ' ':
			charrep = "' '";
			break;
		case '\n':
			charrep = "\\n";
			break;
		case '\t':
			charrep = "\\t";
			break;
		case '\r':
			charrep = "\\r";
			break;
		case '\v':
			charrep = "\\v";
			break;
		case '\f':
			charrep = "\\f";
			break;
		default:
			charrep = "WOOOOOOOOOOPS";
			break;
		}
		SPHO_DEBUG_PRINT("char_at: ret=%d, *out=%s\n", ret, charrep);
	} else {
		SPHO_DEBUG_PRINT("char_at: ret=%d, *out=%c\n", ret, *out);
	}
#endif

	return (ret);
}
/*
 * Copy ncpy characters out of the circular buffer src (capacity src_len),
 * starting at index src_pos and wrapping around to index 0 at the end of
 * the buffer. dst must have room for ncpy characters; no terminator is
 * written.
 *
 * Returns 0 on success and -1 when ncpy exceeds the buffer capacity or
 * src_pos lies outside the buffer.
 */
static int
fromcbuf_charcpy(char *dst, const char *src, size_t src_len, size_t src_pos,
    size_t ncpy)
{
	size_t head, tail;

	if (ncpy > src_len)
		return (-1);
	if (ncpy == 0)
		return (0);
	/* Guards the subtraction below against wrapping around. */
	if (src_pos >= src_len)
		return (-1);

	/* head: run up to the end of the buffer; tail: wrapped remainder. */
	head = src_len - src_pos;
	if (head > ncpy)
		head = ncpy;
	tail = ncpy - head;

	/* Trace only in debug-print builds (same guard char_at() uses). */
#ifdef SPHO_ENABLE_DEBUG_PRINT
	SPHO_DEBUG_PRINT("fromcbuf_charcpy: cpy1=%zu cpy2=%zu\n", head, tail);
#endif

	memcpy(dst, &src[src_pos], head * sizeof(src[0]));
	if (tail > 0) {
		/* Append after the head instead of overwriting it. */
		memcpy(dst + head, &src[0], tail * sizeof(src[0]));
	}

	return (0);
}
/*
 * Try to match the token type m->type at the current read position of src.
 * No input is consumed; use tok_commit() to consume a successful match.
 *
 * On return m->matchlen holds the number of characters matched. It is 0
 * when the token does not match here, and also when the input ends before
 * a fixed-text token could be compared completely.
 *
 * Returns 0 when the attempt completed (inspect m->matchlen for the result)
 * and -1 when reading from src failed or m->type is not a known token type.
 */
static int
tok_match(struct msph_ctx *ctx, struct msph_token_src *src,
    struct msph_matcher *m)
{
	int res;
	char chr;
	const char *match_str;
	size_t off, len;

	SPHO_PRECOND(m != NULL && src != NULL);

	m->matchlen = 0;

/* Match the single character c. Always returns from tok_match(). */
#define MATCH_CHAR(c)							\
	do {								\
		SPHO_DEBUG_PRINT("tok_match: '%c'\n", c);		\
		if ((res = char_at(ctx, src, 0, &chr)) == -1)		\
			return (-1);					\
		else if (res == 0)					\
			return (0);					\
		SPHO_DEBUG_PRINT("tok_match: char_at(0)='%c'\n", chr);	\
									\
		if (chr == (c)) {					\
			m->matchlen = 1;				\
		}							\
		SPHO_DEBUG_PRINT("tok_match: matchlen=%zu\n",		\
		    m->matchlen);					\
		return (0);						\
	} while (0)

/* Match the fixed text str. Always returns from tok_match(). */
#define MATCH_STR(str)							\
	do {								\
		match_str = str;					\
		len = strlen(match_str);				\
		for (off = 0; off < len; off++) {			\
			if ((res = char_at(ctx, src, off, &chr)) == -1)	\
				return (-1);				\
			else if (res == 0)				\
				return (0);				\
									\
			if (chr != match_str[off])			\
				break;					\
		}							\
		if (off == len)						\
			m->matchlen = len;				\
		return (0);						\
	} while (0)

	switch (m->type) {
	case TOK_LBRACE:
		MATCH_CHAR('{');
	case TOK_RBRACE:
		MATCH_CHAR('}');
	case TOK_LBRAK:
		MATCH_CHAR('[');
	case TOK_RBRAK:
		MATCH_CHAR(']');
	case TOK_LPAREN:
		MATCH_CHAR('(');
	case TOK_RPAREN:
		MATCH_CHAR(')');
	case TOK_COLON:
		MATCH_CHAR(':');
	case TOK_DOT:
		MATCH_CHAR('.');
	case TOK_COMMA:
		MATCH_CHAR(',');
	case TOK_EQUALS:
		MATCH_CHAR('=');
	case TOK_AMP:
		MATCH_CHAR('&');
	case TOK_PIPE:
		MATCH_CHAR('|');
	case TOK_IMPL:
		MATCH_STR("=>");
	case TOK_RARROW:
		MATCH_STR("->");
	case TOK_SUB:
		MATCH_STR("<:");
	case TOK_KW_TYPE:
		MATCH_STR("type");
	case TOK_KW_NOMINAL:
		MATCH_STR("nominal");
	case TOK_KW_MEMBER:
		MATCH_STR("member");
	case TOK_KW_ASSERT:
		MATCH_STR("assert");
	case TOK_KW_BOX:
		MATCH_STR("box");
	case TOK_KW_FORALL:
		MATCH_STR("forall");
	case TOK_CONST_TRUE:
		MATCH_STR("True");
	case TOK_CONST_FALSE:
		MATCH_STR("False");
	case TOK_IDENT:
		/* Longest run of alphanumeric characters. */
		off = 0;
		while ((res = char_at(ctx, src, off++, &chr)) == 1) {
			/* <ctype.h> requires an unsigned char value. */
			if (! isalnum((unsigned char)chr))
				break;
			m->matchlen++;
		}
		if (res == -1)
			return (-1);
		return (0);
	case TOK_WSPACE:
		/* Longest run of whitespace characters. */
		off = 0;
		while ((res = char_at(ctx, src, off++, &chr)) == 1) {
			if (! isspace((unsigned char)chr))
				break;
			m->matchlen++;
		}
		if (res == -1)
			return (-1);
		return (0);
	default:
		SPHO_ASSERT(0);
		return (-1);
	}
#undef MATCH_CHAR
#undef MATCH_STR
}
#define TOK_HAS_DATA(type) (type == TOK_IDENT)
static int
tok_commit(struct msph_ctx *ctx, struct msph_token_src *src,
struct msph_matcher *m, struct msph_token *ptr)
{
size_t pos_old;
struct msph_text_pos tok_pos;
struct msph_token_src_str *str;
struct msph_token_src_file *file;
SPHO_PRECOND(ctx != NULL && m != NULL);
SPHO_PRECOND(m->matchlen != 0);
tok_pos = src->pos;
tok_update_pos(ctx, src, m);
switch (src->type) {
case MSPH_TOKEN_SRC_FILE:
file = &src->inner.file;
pos_old = file->pos;
file->pos += m->matchlen;
file->pos %= BUF_LEN(file->buf);
SPHO_ASSERT(file->pos <= file->end ||
(file->pos < pos_old && file->pos < BUF_LEN(file->buf)));
if (ptr == NULL)
return (0);
ptr->type = m->type;
ptr->pos = tok_pos;
if (! TOK_HAS_DATA(ptr->type))
return (0);
if (m->matchlen >= sizeof(ptr->data.str)) {
MSPH_ERR(ctx, MSPH_ERR_TOKEN_TOOLONG);
return (-1);
}
if (fromcbuf_charcpy(ptr->data.str, file->buf,
sizeof(file->buf), pos_old, m->matchlen) == -1) {
MSPH_ERR(ctx, MSPH_ERR_TOKEN_TOOLONG);
return (-1);
}
ptr->data.str[m->matchlen] = '\0';
return (0);
case MSPH_TOKEN_SRC_STR:
str = &src->inner.str;
pos_old = str->pos;
str->pos += m->matchlen;
SPHO_ASSERT(str->pos <= str->len);
if (ptr == NULL)
return (0);
ptr->type = m->type;
ptr->pos = tok_pos;
if (! TOK_HAS_DATA(ptr->type))
return (0);
if (m->matchlen >= sizeof(ptr->data.str)) {
MSPH_ERR(ctx, MSPH_ERR_TOKEN_TOOLONG);
return (-1);
}
memcpy(ptr->data.str, str->s, m->matchlen *
sizeof(str->s[0]));
ptr->data.str[m->matchlen] = '\0';
return (0);
default:
return (-1);
}
}
/*
 * Advance the line/column position of src over the m->matchlen characters
 * that start at the current read position.
 *
 * A newline starts a new line at column 1, a tab advances the column by
 * MSPH_TAB_WIDTH, a carriage return leaves the position unchanged, and any
 * other character advances the column by one. The read position itself is
 * not moved.
 */
static void
tok_update_pos(struct msph_ctx *ctx, struct msph_token_src *src,
    struct msph_matcher *m)
{
	size_t idx;
	int res;
	char ch;

	for (idx = 0; idx < m->matchlen; idx++) {
		/* The characters were matched before, so they must exist. */
		res = char_at(ctx, src, idx, &ch);
		SPHO_ASSERT(res == 1);

		if (ch == '\n') {
			src->pos.line++;
			src->pos.col = 1;
		} else if (ch == '\t') {
			src->pos.col += MSPH_TAB_WIDTH;
		} else if (ch != '\r') {
			src->pos.col++;
		}
	}
}
/*
 * Look up the debug name (the TOK_* identifier as text) of tok's type in
 * token_info. Returns NULL when the type is not listed.
 */
static const char *
tok_base_str(struct msph_token *tok)
{
	size_t idx = 0;

	while (token_info[idx].type != TOK_END) {
		if (token_info[idx].type == tok->type)
			return (token_info[idx].dbg_str);
		idx++;
	}

	return (NULL);
}