Ben/Nujel - lib/reader.c

Ben (X13/Arch) Nicer report 0869b7c 2 years ago 1012 Commits
Raw
text/plain • 19.28 KB • 641 lines
/* Nujel - Copyright (C) 2020-2022 - Benjamin Vincent Schulenburg
 * This project uses the MIT license, a copy should be included under /LICENSE
 */
#ifndef NUJEL_AMALGAMATION
#include "nujel-private.h"
#endif

#include <ctype.h>
#include <limits.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

#define isparen(v) (((v) == '(') || ((v) == ')'))
#define isbracket(v) (((v) == '[') || ((v) == ']'))
#define isbrace(v) (((v) == '{') || ((v) == '}'))
#define isClosingChar(v) (((v) == ')') || ((v) == ']') || ((v) == '}'))
#define isnonsymbol(v)                                                                                                 \
    (isparen(v) || isbracket(v) || isbrace(v) || ((v) == '#') || ((v) == '\'') || ((v) == '\"') || ((v) == '`') ||     \
     ((v) == ';'))
#define isnumericseparator(v) (((v) == '_') || ((v) == ','))

typedef struct {
    const char *buf, *bufEnd, *data;
    lClosure *c;
} lReadContext;

static lVal lReadValue(lReadContext *s);
static lVal lReadList(lReadContext *s, bool rootForm, char terminator);

static lVal lValExceptionReaderCustom(lReadContext *s, const char *msg, const char *customError) {
    lVal err = lValStringError(s->buf, s->bufEnd, MAX(s->buf, s->bufEnd - 30), s->bufEnd, s->bufEnd);
    return lValException(customError, msg, err);
}

static lVal lValExceptionReader(lReadContext *s, const char *msg) {
    return lValExceptionReaderCustom(s, msg, "read-error");
}

static lVal lValExceptionReaderStartEnd(lReadContext *s, const char *msg) {
    const char *start, *end;
    for (start = s->data; (start > s->buf) && (*start != '"') && ((start <= s->buf) || (start[-1] != '\\')); start--) {
    }
    for (end = s->data; (end < s->bufEnd) && (*end != '"') && (end[-1] != '\\'); end++) {
    }
    return lValException("read-error", msg, lValStringError(s->buf, s->bufEnd, start, s->data, end));
}

static lVal lValExceptionReaderEnd(lReadContext *s, const char *start, const char *msg) {
    const char *end;
    for (end = s->data; (end < s->bufEnd) && ((*end > ' ') && !isnonsymbol(*end)); end++) {
    }
    return lValException("read-error", msg, lValStringError(s->buf, s->bufEnd, start, s->data, end));
}

static double createFloat(i64 value, i64 mantissa, i64 mantissaLeadingZeroes) {
    if (mantissa == 0) {
        return value;
    }
    const double mant = mantissa * pow(10, -(floor(log10(mantissa)) + 1 + mantissaLeadingZeroes));
    return value + mant;
}

static void lStringAdvanceToNextCharacter(lReadContext *s) {
    for (; (s->data < s->bufEnd) && (isspace((u8)*s->data)); s->data++) {
    }
}

static void lStringAdvanceToNextSpaceOrSpecial(lReadContext *s) {
    for (; (s->data < s->bufEnd) && (!isspace((u8)*s->data)); s->data++) {
        const u8 c = *s->data;
        if (isnonsymbol(c)) {
            break;
        }
        if (*s->data == ':') {
            break;
        }
    }
}
static void lStringAdvanceToNextLine(lReadContext *s) {
    for (; (s->data < s->bufEnd) && (*s->data != '\n'); s->data++) {
    }
}

static void lStringAdvanceUntilEndOfBlockComment(lReadContext *s) {
    const char *end = s->bufEnd - 1;
    for (; (s->data < end); s->data++) {
        if ((s->data[0] == '#') && (s->data[1] == '|')) {
            s->data += 2;
            lStringAdvanceUntilEndOfBlockComment(s);
        } else if ((s->data[0] == '|') && (s->data[1] == '#')) {
            s->data += 2;
            return;
        }
    }
}

/* Parse the string literal in s and return the resulting ltString lVal */
static lVal lParseString(lReadContext *s) {
    static char *buf = NULL;
    static uint bufSize = 1 << 12; // Start with 4K
    if (buf == NULL) {
        buf = malloc(bufSize);
    }
    if (buf == NULL) {
        exit(20);
    }
    char *b = buf;
    uint i = 0;
    while (s->data < s->bufEnd) {
        if (unlikely(++i >= bufSize)) {
            bufSize *= 2;
            buf = realloc(buf, bufSize);
            if (buf == NULL) {
                exit(21);
            }
            b = &buf[i];
        }
        if (unlikely(*s->data == '\\')) {
            s->data++;
            switch (*s->data) {
            case '0':
                *b++ = 0;
                break;
            case 'a':
                *b++ = '\a';
                break;
            case 'b':
                *b++ = '\b';
                break;
            case 't':
                *b++ = '\t';
                break;
            case 'n':
                *b++ = '\n';
                break;
            case 'v':
                *b++ = '\v';
                break;
            case 'f':
                *b++ = '\f';
                break;
            case 'r':
                *b++ = '\r';
                break;
            case 'e':
                *b++ = 0x1B; // Has to be hardcoded due to OpenWatcom
                break;
            case '"':
                *b++ = '"';
                break;
            case '\\':
                *b++ = '\\';
                break;
            default:
                return lValExceptionReaderStartEnd(s, "Unknown escape character");
            }
            s->data++;
        } else if (unlikely(*s->data == '"')) {
            s->data++;
            return lValAlloc(ltString, lStringNew(buf, b - buf));
        } else if (unlikely(*s->data == 0)) {
            if (likely(i < bufSize)) {
                buf[i] = 0;
            }
            return lValException("read-error", "Can't find closing \"", lValString(buf));
        } else {
            *b++ = *s->data++;
        }
    }
    if (likely(i < bufSize)) {
        buf[i] = 0;
    }
    return lValException("read-error", "Can't find closing \"", lValString(buf));
}

/* Parse s as a symbol and return the ltSymbol lVal */
static lVal lParseSymbol(lReadContext *s) {
    uint i;
    char buf[128];
    bool keyword = false;
    const char *start = s->data;
    for (i = 0; i < (sizeof(buf) - 1); i++) {
        const char c = *s->data++;
        if (c == ':') {
            keyword = true;
            if (i > 0) {
                const char cc = *s->data++;
                if ((cc == 0) || isspace((u8)cc) || isnonsymbol(cc)) {
                    s->data--;
                    break;
                }
                return lValExceptionReaderEnd(s, start, "Can't have a colon there");
            }
        }
        if ((c == 0) || isspace((u8)c) || isnonsymbol(c)) {
            s->data--;
            break;
        }
        buf[i] = c;
    }
    buf[i] = 0;
    while (isspace((u8)*s->data)) {
        if (*s->data == 0) {
            break;
        }
        s->data++;
    }

    char *kwstart = buf;
    if (unlikely(i > 0) && (buf[i - 1] == ':')) {
        buf[i - 1] = 0;
    }
    if (buf[0] == ':') {
        kwstart = &buf[1];
    }
    if (unlikely(*start == 0)) {
        return lValExceptionReaderEnd(s, kwstart, "Sym/KW too short");
    }
    return keyword ? lValKeyword(kwstart) : lValSym(buf);
}

static lVal lParseNumberBase(lReadContext *s, int *leadingZeroes, int base, int maxDigits) {
    i64 ret = 0;
    int zeroes = 0, digits = 0;
    const char *start = s->data;

    for (; s->data < s->bufEnd; s->data++) {
        const u8 c = tolower(*s->data);
        if ((c <= ' ') || isnonsymbol(c) || (c == '.')) {
            break;
        }

        int curDigit = -1;
        if ((c >= '0') && (c <= '9')) {
            curDigit = c - '0';
        } else if ((c >= 'a') && (c <= 'z')) {
            curDigit = (c - 'a') + 10;
        }

        if ((curDigit >= 0) && (curDigit < base)) {
            ret = (ret * base) + curDigit;
            if (!ret) {
                zeroes++;
            }
            if ((++digits - zeroes) > maxDigits) {
                return lValExceptionReaderEnd(s, start, "Literal too big, loss of precision imminent");
            }
        } else {
            if (!isnumericseparator(c)) {
                return lValExceptionReaderEnd(s, start, "Wrong char in literal");
            }
        }
    }

    if (leadingZeroes != NULL) {
        *leadingZeroes = zeroes;
    }
    return lValInt(ret);
}

static lVal lParseNumber(lReadContext *s, int base, int maxDigits) {
    const char *start = s->data;
    bool negative = false;
    if (*start == '-') {
        s->data++;
        negative = true;
    }
    lVal val = lParseNumberBase(s, NULL, base, maxDigits);
    if (unlikely(val.type == ltException)) {
        return val;
    }
    if (*s->data == '.') {
        s->data++;
        int mantissaLeadingZeroes = 0;
        const lVal mantissaVal = lParseNumberBase(s, &mantissaLeadingZeroes, base, maxDigits);
        if (unlikely(mantissaVal.type == ltException)) {
            return mantissaVal;
        }
        if (*s->data == '.') {
            return lValExceptionReaderEnd(s, start, "Period at end of number");
        } else {
            const double valf = createFloat(val.vInt, mantissaVal.vInt, mantissaLeadingZeroes);
            return lValFloat(negative ? -valf : valf);
        }
    }
    if (negative) {
        val.vInt = -val.vInt;
    }
    return val;
}

static lVal lParseCharacter(lReadContext *s) {
    int ret = s->data[0];
    if ((s->data[0] == 'B') && (s->data[1] == 'a')) {
        ret = '\b';
    } else if ((s->data[0] == 'T') && (s->data[1] == 'a')) {
        ret = '\t';
    } else if ((s->data[0] == 'L') && (s->data[1] == 'i')) {
        ret = '\n';
    } else if ((s->data[0] == 'R') && (s->data[1] == 'e')) {
        ret = '\r';
    } else if ((s->data[0] == 'l') && (s->data[1] == 'f')) {
        ret = '\n';
    } else if ((s->data[0] == 'c') && (s->data[1] == 'r')) {
        ret = '\r';
    }
    s->data++;
    lStringAdvanceToNextSpaceOrSpecial(s);
    return lValInt(ret);
}

static lVal lValExceptionBCRead(lReadContext *s, lVal v, const char *msg) {
    char buf[128];
    snprintf(buf, sizeof(buf), "invalid %s in Bytecoded Array", msg);
    buf[sizeof(buf) - 1] = 0;
    return lValException("read-error", buf, lCons(v, lValStringError(s->buf, s->bufEnd, s->data, s->data, s->data)));
}

static lVal lParseBytecodeArray(lReadContext *s) {
    u8 *d = NULL; // ToDo: make this buffer static / move to stack
    int size = 0;
    int len = 0;
    lArray *literals = NULL;

    lVal v = lReadValue(s);
    if (v.type != ltArray) {
        return lValExceptionBCRead(s, v, "Invalid literal array in BCA");
    }
    literals = v.vArray;
    literals->flags = ARRAY_IMMUTABLE;

    while (s->data < s->bufEnd) {
        if ((len + 4) >= size) {
            size = MAX(size, 128) * 2;
            u8 *newD = realloc(d, size);
            if (unlikely(newD == NULL)) {
                free(d);
                return lValException("out-of-memory", "OOM during BC Arr Parse", NIL);
            }
            d = newD;
        }
        lStringAdvanceToNextCharacter(s);
        char c = *s->data++;
        int t = 0;
        if ((c >= '0') && (c <= '9')) {
            t = (c - '0') << 4;
            goto readSecondNibble;
        }
        if ((c >= 'A') && (c <= 'F')) {
            t = ((c - 'A') + 0xA) << 4;
            goto readSecondNibble;
        }
        if ((c >= 'a') && (c <= 'f')) {
            t = ((c - 'a') + 0xA) << 4;
            goto readSecondNibble;
        }
        if (c == '}') {
            break;
        }

    readSecondNibble:
        if (s->data >= s->bufEnd) {
            free(d);
            return lValExceptionBCRead(s, NIL, "sudden end");
        }
        c = *s->data++;
        if ((c >= '0') && (c <= '9')) {
            t |= (c - '0');
            goto storeOP;
        }
        if ((c >= 'A') && (c <= 'F')) {
            t |= ((c - 'A') + 0xA);
            goto storeOP;
        }
        if ((c >= 'a') && (c <= 'f')) {
            t |= ((c - 'a') + 0xA);
            goto storeOP;
        }
        free(d);
        return lValException("read-error", "Wrong char in BCArr",
                             lValStringError(s->buf, s->bufEnd, s->data, s->data, s->data + 1));

    storeOP:
        d[len++] = (u8)t;
    }
    lVal ret = lValBytecodeArray(d, len, literals);
    free(d);
    return ret;
}

static lVal lParseBuffer(lReadContext *s) {
    u8 *buf = NULL;
    size_t len = 0;
    size_t bufSize = 0;
    while (s->data < s->bufEnd) {
        u8 curByte = 0;
        u8 c = *s->data;
        if (isspace(c) || isnonsymbol(c)) {
            break;
        }
        if (c < '0') {
            free(buf);
            return lValExceptionReaderStartEnd(s, "Wrong char in buffer lit.");
        }
        if (c <= '9') {
            curByte = (c - '0') << 4;
        } else {
            if ((c < 'A') || (c > 'F')) {
                free(buf);
                return lValExceptionReaderStartEnd(s, "Wrong char in buffer lit.");
            }
            curByte = ((c - 'A') + 0xA) << 4;
        }

        s->data++;
        if (s->data >= s->bufEnd) {
            free(buf);
            return lValExceptionReaderStartEnd(s, "Unexpected end of buffer");
        }
        c = *s->data++;
        if (isspace(c)) {
            free(buf);
            return lValExceptionReaderStartEnd(s, "Unexpected end of literal");
        }
        if (c < '0') {
            return lValExceptionReaderStartEnd(s, "Wrong char in buffer lit.");
        }
        if (c <= '9') {
            curByte |= (c - '0');
        } else {
            if ((c < 'A') || (c > 'F')) {
                return lValExceptionReaderStartEnd(s, "Wrong char in buffer lit.");
            }
            curByte |= ((c - 'A') + 0xA);
        }

        if (len >= bufSize) {
            bufSize = MAX(bufSize * 2, 256);
            u8 *newBuf = realloc(buf, bufSize);
            if (unlikely(newBuf == NULL)) {
                free(buf);
                return lValException("out-of-memory", "OOM during buffer parse", NIL);
            }
            buf = newBuf;
        }
        buf[len++] = curByte;
    }
    lStringAdvanceToNextCharacter(s);

    u8 *newBuf = realloc(buf, len);
    if (unlikely(newBuf == NULL)) {
        free(buf);
        return lValException("out-of-memory", "OOM during buffer parse outtro", NIL);
    }
    lVal ret = lValAlloc(ltBuffer, lBufferAlloc(len, true));
    ret.vBuffer->buf = newBuf;
    return ret;
}

static lVal lParseSpecial(lReadContext *s) {
    if (s->data >= s->bufEnd) {
        return NIL;
    }
    switch (*s->data++) {
    default:
        return lValExceptionReaderStartEnd(s, "Wrong char in special lit.");
    case '|': // SRFI-30
        lStringAdvanceUntilEndOfBlockComment(s);
        lStringAdvanceToNextCharacter(s);
        return lValComment();
    case '!': // Ignore Shebang's
        lStringAdvanceToNextLine(s);
        return lValComment();
    case '\\':
        return lParseCharacter(s);
    case 'm':
        return lParseBuffer(s);
    case 'x':
        return lParseNumber(s, 16, 16);
    case 'd':
        return lParseNumber(s, 10, 18);
    case 'o':
        return lParseNumber(s, 8, 21);
    case 'b':
        return lParseNumber(s, 2, 64);
    case 'n':
        lStringAdvanceToNextSpaceOrSpecial(s);
        return NIL;
    case 't':
        return lValBool(true);
    case 'f':
        return lValBool(false);
    case '{':
        return lParseBytecodeArray(s);
    case '#':
        s->data++;
        return lnfArrNew(s->c, lReadList(s, false, ')'));
    case '@': {
        s->data++;
        lVal ret = lnfTreeNew(s->c, lReadList(s, false, ')'));
        if (ret.vTree->root) {
            ret.vTree->root->flags |= TREE_IMMUTABLE;
        }
        return ret;
    }
    }
}

static lVal lReadList(lReadContext *s, bool rootForm, char terminator) {
    lVal v = NIL, ret = NIL;
    while (1) {
        lStringAdvanceToNextCharacter(s);

        const char c = *s->data;
        if ((s->data >= s->bufEnd) || (c == 0)) {
            if (!rootForm) {
                return lValExceptionReaderCustom(s, "Unmatched opening bracket", "unmatched-opening-bracket");
            }
            s->data++;
            return ret.type == ltNil ? lCons(NIL, NIL) : ret;
        } else if (c == ';') {
            lStringAdvanceToNextLine(s);
            continue;
        } else if (c == terminator) {
            if (rootForm) {
                return lValExceptionReader(s, "Unmatched closing bracket");
            }
            s->data++;
            return ret.type == ltNil ? lCons(NIL, NIL) : ret;
        } else if (isClosingChar(c)) {
            return lValExceptionReader(s, "Unmatched closing char");
        } else {
            const u8 next = s->data[1];
            if ((c == '.') && (isspace(next) || isnonsymbol(next))) {
                if (unlikely(v.type == ltNil)) {
                    return lValExceptionReader(s, "Missing car in dotted pair");
                }
                s->data++;
                lVal nv;
                do {
                    if (unlikely((s->data >= s->bufEnd) || (*s->data == 0) || (*s->data == ')'))) {
                        return lValExceptionReader(s, "Missing cdr in dotted pair");
                    }
                    lStringAdvanceToNextCharacter(s);
                    nv = lReadValue(s);
                    if (unlikely(nv.type == ltException)) {
                        return nv;
                    }
                } while (isComment(nv));
                v.vList->cdr = isComment(nv) ? NIL : nv;
                continue;
            } else {
                lVal nv = lReadValue(s);
                if (unlikely(isComment(nv))) {
                    continue;
                }
                if (unlikely(nv.type == ltException)) {
                    return nv;
                }
                if (v.type == ltNil) {
                    v = ret = lCons(nv, NIL);
                } else {
                    v.vList->cdr = lCons(nv, NIL);
                    v = v.vList->cdr;
                }
            }
        }
    }
}

static lVal lReadQuote(lReadContext *s, lSymbol *carSym) { return lCons(lValSymS(carSym), lCons(lReadValue(s), NIL)); }

static lVal lReadValue(lReadContext *s) {
    if (s->data >= s->bufEnd) {
        return NIL;
    }
    const char c = *s->data;

    switch (c) {
    case 0:
        return NIL;
    case '(':
        s->data++;
        return lReadList(s, false, ')');
    case '[':
        s->data++;
        return lCons(lValSymS(symArr), lReadList(s, false, ']'));
    case '{':
        s->data++;
        return lCons(lValSymS(symTreeNew), lReadList(s, false, '}'));
    case '~':
        s->data++;
        if (*s->data == '@') {
            s->data++;
            return lReadQuote(s, symUnquoteSplicing);
        }
        return lReadQuote(s, symUnquote);
    case '`':
        s->data++;
        return lReadQuote(s, symQuasiquote);
    case '\'':
        s->data++;
        return lReadQuote(s, symQuote);
    case '"':
        s->data++;
        return lParseString(s);
    case '#':
        s->data++;
        if (s->data >= s->bufEnd) {
            return NIL;
        } else if (*s->data == ';') {
            ++s->data;
            if ((s->data < s->bufEnd) && (*s->data == '(')) {
                s->data++;
                lReadList(s, false, ')');
                return lValComment();
            } else {
                lReadValue(s);
                return lValComment();
            }
        }
        return lParseSpecial(s);
    case ';':
        lStringAdvanceToNextLine(s);
        return lReadValue(s);
    default:
        if ((isdigit((u8)c)) || ((c == '-') && isdigit(s->data[1]))) {
            return lParseNumber(s, 10, 18);
        }
        return lParseSymbol(s);
    }
}

lVal lRead(lClosure *c, const char *str) {
    lReadContext ctx;
    ctx.c = c;
    ctx.buf = ctx.data = str;
    ctx.bufEnd = &str[strlen(str)];
    return lReadList(&ctx, true, ')');
}
Ben / Nujel