Brains, Programming Languages, Disconnected Enthusiasms

Thursday, September 06, 2007

Parsing s-expressions with Parsnip

Moving to New York has been quite hectic, but I have found a few spare moments to keep working on Parsnip. The library is still patchy and probably full of undiscovered bugs, but nonetheless quite powerful.

As a demonstration of Parsnip's capabilities, I used the library to build a small s-expression parser.

First, we need some data structures to represent s-expressions in C++:



//All s-expression objects (nil, cons, number, etc...) derive from LispObject

struct LispObject
{
virtual bool isCons() { return false; }
virtual bool isNumber() { return false; }
virtual bool isSymbol() { return false; }
virtual bool isNil() { return false; }

virtual std::string toString()=0;
};

typedef ptr<LispObject> ObjPtr;


/*
Cons cells are the fundamental element of s-expression lists.
Each cell contains a head (which is a data item, in the case of a flat list)
and a tail (which is the next cons cell in the list).
The last cell has a tail which points to nil (like a null-terminated string).
*/


struct Cons : public LispObject
{
Cons(ObjPtr _head, ObjPtr _tail) : head(_head), tail(_tail) {}
virtual bool isCons() { return true; }
virtual std::string toString()
{
return "(" + head->toString() + " . " + tail->toString() + ")";
}

ObjPtr head;
ObjPtr tail;
};
typedef ptr<Cons> ConsPtr;

ObjPtr makeCons(ObjPtr head, ObjPtr tail)
{
return new Cons(head, tail);
}

Cons* toCons(ObjPtr obj)
{
return static_cast<Cons*>(obj.GetRawPointer());
}


// Symbols are any variable/identifier


struct Symbol : public LispObject
{
Symbol(const std::string& _name) : name(_name) {}
virtual bool isSymbol() { return true; }
std::string toString() { return name; }

std::string name;
};
typedef ptr<Symbol> SymPtr;

ObjPtr makeSymbol(const std::string& str)
{
return new Symbol(str);
}


// Number objects are boxed (or wrapped) floats

struct Number : public LispObject
{
Number(float _val) : value(_val) {}
virtual bool isNumber() { return true; }
std::string toString() { return to_string(value); }

float value;
};

typedef ptr<Number> NumPtr;

ObjPtr makeNumber(float val)
{
return new Number(val);
}


// Nil is the terminating element at the end of each list


struct NilObject : public LispObject
{
bool isNil() { return true; }

static ObjPtr getNil()
{
if (nil) { return nil; }
else
{
nil = new NilObject;
return nil;
}
}
std::string toString() { return "()"; }

private:
static ObjPtr nil;
};
ObjPtr NilObject::nil;

ObjPtr getNil()
{
return NilObject::getNil();
}


/*
The BuildCons accumulator class is a template parameter
to the many/many1 parser combinators.
Each ObjPtr accepted by the many/many1 parser is passed
to an instance of BuildCons.
A resulting cons-list can be extracted using the result() method.
*/


struct BuildCons : public Accumulator
{
BuildCons()
{
first_cell = getNil();
last_cell = first_cell;
}

virtual void accum(const ObjPtr& o)
{
static ObjPtr nil = getNil();
if(first_cell->isCons())
{
ObjPtr new_cell = makeCons(o, nil);
toCons(last_cell)->tail = new_cell;
last_cell = new_cell;
}
else
{
first_cell = makeCons(o, nil);
last_cell = first_cell;
}
}
virtual ObjPtr result() { return first_cell; }

private:
ObjPtr first_cell;
ObjPtr last_cell;
};


Next, we construct the parser itself:



// An ObjParser is any parser which accepts an input std::string and returns an ObjPtr.


typedef Parser<string, ObjPtr>::type ObjParser;
ObjParser lisp_symbol = call1(makeSymbol, many1(letters | oneOf("+-/*=@!?&^~:<>%")));


/*
The 'real' parser accepts strings such as "3.42" and "200".
The utility function makeNumber accepts a string and converts
it to a float.
The function 'call1' creates a parser which takes the result of
its second argument applies to that its first argument. In short,
it calls makeNumber.
*/

ObjParser lisp_number = call1(makeNumber, real);

typedef Parser<string, void>::type VoidParser;

VoidParser LP = skip(token_ch('('));
VoidParser RP = skip(token_ch(')'));

ObjParser lisp_nil = call0(getNil, LP >> RP);
ObjParser lisp_atom = lisp_symbol | lisp_number | lisp_nil;


/*
The lisp_cons parser can't refer to itself directly,
so we have to use a proxy LazyParser called cons_self.
*/


ObjParser cons_self = lazy<string, ObjPtr>();
ObjParser lisp_cons = LP >> many<BuildCons>(token(lisp_atom | cons_self)) >> RP;
ObjParser lisp_expr = lisp_atom | lisp_cons;

setLazy(cons_self, lisp_cons);


Lastly, we add an input loop to test how the expressions get parsed:


std::string input;

while (true)
{
std::cout << "> ";
std::getline(std::cin, input);
Result<ObjPtr> result = parse(input, lisp_expr);
if (input == "exit") break;
if (parse_finished(result))
{
cout < result.data()->toString() << endl;
}
else if (input_consumed(result))
{
cout << "Unexpected end of input string " << endl;
}
else
{
int pos = parse_position(result);
cout << "Unexpected character '" << input[pos] << "' at position " << pos << endl;
}
}


And that's it! We're now half way to writing an s-expression based language (ie, a mini-lisp or a configuration DSL).

No comments: