Recently I finished the
coursera compiler course where we wrote a fully functional
COOL language compiler as part of course work(
some related posts).
Because the course encouraged and also the time pressure was there, at the time I had used
jlex lexical analyzer generator to generate the lexer. It was a good learning to describe your token specification completely in terms of regexes and let the generator do its magic.
But, often times, I have read/heard that most of the production compilers don't use any lexer generators and lexers are hand coded. So, I also wanted to hand code the COOL lexer. Since, having written the COOL lexer spec for jlex, I understood the details already and it took a bit more than a day to handcode it.
Here it is and I guess it is simple enough to follow for anyone to understand(Also, lexical structure of COOL can be found in section-10 in the
manual)...
package info.himanshug.coolc;
import info.himanshug.coolc.provided.AbstractTable;
import info.himanshug.coolc.provided.TokenConstants;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
public class HandCodedCoolLexer {
private String fileName; //source code file name
private BufferedReader stream; //stream of characters
private int currLineNo = 1;
//all COOL keywords and their token constant map
private Map keywordsTokenConstants = new HashMap();
public HandCodedCoolLexer(FileReader stream, String fileName) {
this.stream = new BufferedReader(stream);
this.fileName = fileName;
keywordsTokenConstants.put("class", TokenConstants.CLASS);
keywordsTokenConstants.put("else",TokenConstants.ELSE);
keywordsTokenConstants.put("fi",TokenConstants.FI);
keywordsTokenConstants.put("if",TokenConstants.IF);
keywordsTokenConstants.put("in",TokenConstants.IN);
keywordsTokenConstants.put("inherits",TokenConstants.INHERITS);
keywordsTokenConstants.put("isvoid",TokenConstants.ISVOID);
keywordsTokenConstants.put("let",TokenConstants.LET);
keywordsTokenConstants.put("loop",TokenConstants.LOOP);
keywordsTokenConstants.put("pool",TokenConstants.POOL);
keywordsTokenConstants.put("then",TokenConstants.THEN);
keywordsTokenConstants.put("while",TokenConstants.WHILE);
keywordsTokenConstants.put("case",TokenConstants.CASE);
keywordsTokenConstants.put("esac",TokenConstants.ESAC);
keywordsTokenConstants.put("new",TokenConstants.NEW);
keywordsTokenConstants.put("of",TokenConstants.OF);
keywordsTokenConstants.put("not",TokenConstants.NOT);
}
private int ch; //last read char
private StringBuilder sb = new StringBuilder(); //buffer for look ahead characters
//Main work-horse called by client code to get COOL
//token
public Symbol getNextToken() throws IOException {
//get net char, skip the whitespace characters
for(ch = getNextChar(); isWhitespaceChar(ch); ch = getNextChar()) {
if(ch == '\n')
currLineNo++;
}
if(isEOF(ch))
return new Symbol(TokenConstants.EOF);
if(isDigit(ch)) {
return getIntegerToken();
} else if(isLetter(ch)) {
return getIdentifierOrKeywordToken();
} else if(ch == '"') {
return getStringToken();
} else if(ch == '-') {
int tmp = getNextChar();
if(tmp == '-') {
skipLineComment();
return getNextToken(); //TODO: should use while loop, no tail-recursion
} else {
appendToLookaheadBuffer(tmp);
return new Symbol(TokenConstants.MINUS);
}
} else if(ch == '(') {
int tmp = getNextChar();
if(tmp == '*') {
Symbol err = skipBlockComment();
if(err != null)
return err;
else
return getNextToken();
} else {
appendToLookaheadBuffer(tmp);
return new Symbol(TokenConstants.LPAREN);
}
} else if(ch == '<') {
int tmp = getNextChar();
if(tmp == '=') {
return new Symbol(TokenConstants.LE);
} else if(tmp == '-') {
return new Symbol(TokenConstants.ASSIGN);
} else {
appendToLookaheadBuffer(tmp);
return new Symbol(TokenConstants.LT);
}
} else if(ch == '=') {
int tmp = getNextChar();
if(tmp == '>') {
return new Symbol(TokenConstants.DARROW);
} else {
appendToLookaheadBuffer(tmp);
return new Symbol(TokenConstants.EQ);
}
} else if(ch == '*') {
int tmp = getNextChar();
if(tmp == ')') {
return new Symbol(TokenConstants.ERROR, "Unmatched *)");
} else {
appendToLookaheadBuffer(tmp);
return new Symbol(TokenConstants.MULT);
}
} else if(ch == ';') {
return new Symbol(TokenConstants.SEMI);
} else if(ch == ')') {
return new Symbol(TokenConstants.RPAREN);
} else if(ch == ',') {
return new Symbol(TokenConstants.COMMA);
} else if(ch == '/') {
return new Symbol(TokenConstants.DIV);
} else if(ch == '+') {
return new Symbol(TokenConstants.PLUS);
} else if(ch == '.') {
return new Symbol(TokenConstants.DOT);
} else if(ch == ':') {
return new Symbol(TokenConstants.COLON);
} else if(ch == '~') {
return new Symbol(TokenConstants.NEG);
} else if(ch == '{') {
return new Symbol(TokenConstants.LBRACE);
} else if(ch == '}') {
return new Symbol(TokenConstants.RBRACE);
} else if(ch == '@') {
return new Symbol(TokenConstants.AT);
} else {
return new Symbol(TokenConstants.ERROR, Character.toString((char)ch));
}
}
private Symbol getIntegerToken() throws IOException {
StringBuilder buff = new StringBuilder();
buff.append((char)ch);
ch = getNextChar();
while(isDigit(ch)) {
buff.append((char)ch);
ch = getNextChar();
}
appendToLookaheadBuffer(ch);
return new Symbol(TokenConstants.INT_CONST,
AbstractTable.inttable.addString(buff.toString()));
}
private Symbol getIdentifierOrKeywordToken() throws IOException {
StringBuilder buff = new StringBuilder();
buff.append((char)ch);
ch = getNextChar();
while(isLetter(ch) || isDigit(ch) || ch == '_') {
buff.append((char)ch);
ch = getNextChar();
}
appendToLookaheadBuffer(ch);
String lexeme = buff.toString();
//first see if lexeme is a keyword
//they are case-insensitive
String s = lexeme.toLowerCase();
if(keywordsTokenConstants.containsKey(s))
return new Symbol(keywordsTokenConstants.get(s));
//see if it is true/false
//first char has to be lower case, rest is case insensitive
if(buff.charAt(0) == 't' && buff.length() == 4 && "rue".equalsIgnoreCase(buff.substring(1))) {
return new Symbol(TokenConstants.BOOL_CONST,Boolean.valueOf(true));
} else if(buff.charAt(0) == 'f' && buff.length() == 5 && "alse".equalsIgnoreCase(buff.substring(1))) {
return new Symbol(TokenConstants.BOOL_CONST,Boolean.valueOf(false));
}
//otherwise its a Typeid or Objectid depending upon the case of
//first char
if(Character.isUpperCase(buff.charAt(0))) {
//Typeid
return new Symbol(TokenConstants.TYPEID,AbstractTable.idtable.addString(lexeme));
} else {
return new Symbol(TokenConstants.OBJECTID,AbstractTable.idtable.addString(lexeme));
}
}
private Symbol getStringToken() throws IOException {
int maxLength = 1024;
StringBuilder buff = new StringBuilder();
boolean escaped = false;
boolean containsNull = false;
boolean tooLong = false;
while(true) {
ch = getNextChar();
if(isEOF(ch))
return new Symbol(TokenConstants.ERROR,"EOF in string constant");
if(escaped) {
if(ch == 'b')
buff.append('\b');
else if(ch == 't')
buff.append('\t');
else if(ch == 'n')
buff.append('\n');
else if(ch == 'f')
buff.append('\f');
else if(ch == '\0')
containsNull = true;
else {
buff.append((char)ch);
}
if(ch == '\n')
currLineNo++;
escaped = false;
} else {
if(ch == '\\')
escaped = true;
else if(ch == '\n') {
currLineNo++;
return new Symbol(TokenConstants.ERROR,"Unterminated string constant");
} else if(ch == '\0')
containsNull = true;
else if(ch == '"') {
//terminate
break;
} else {
buff.append((char)ch);
}
}
if(buff.length() > maxLength) {
tooLong = true;
buff.setLength(0);
}
}
if(containsNull)
return new Symbol(TokenConstants.ERROR,"String contains null character.");
else if(tooLong)
return new Symbol(TokenConstants.ERROR,"String constant too long");
else
return new Symbol(TokenConstants.STR_CONST,AbstractTable.stringtable.addString(buff.toString()));
}
private void skipLineComment() throws IOException {
ch = getNextChar();
while(ch != '\n' && !isEOF(ch)) {
ch = getNextChar();
}
appendToLookaheadBuffer(ch);
}
//returns error token if comment is incomplete or else
//return null
private Symbol skipBlockComment() throws IOException {
int blockCommentOpenCount = 1;
while(blockCommentOpenCount > 0) {
ch = getNextChar();
if(ch == '(') {
int tmp = getNextChar();
if(tmp == '*')
blockCommentOpenCount++;
else {
appendToLookaheadBuffer(tmp);
}
} else if(ch == '*') {
int tmp = getNextChar();
if(tmp == ')')
blockCommentOpenCount--;
else {
appendToLookaheadBuffer(tmp);
}
} else if(ch == '\n') {
currLineNo++;
} else if(isEOF(ch)) {
return new Symbol(TokenConstants.ERROR,"EOF in comment");
}
}
return null;
}
//utilities to read data from stream or buffer if any lookaheads happened
private int getNextChar() throws IOException {
if(sb.length() > 0) {
char c = sb.charAt(0);
sb.deleteCharAt(0);
return c;
} else {
return stream.read();
}
}
private void appendToLookaheadBuffer(int ch) {
if(!isEOF(ch))
sb.append((char)ch);
}
private boolean isEOF(int c) {
return c < 0;
}
private boolean isDigit(int c) {
return Character.isDigit(c);
}
private boolean isLetter(int c) {
return Character.isUpperCase(c) || Character.isLowerCase(c);
}
private boolean isWhitespaceChar(int c) {
return c == 32 //space
|| c == 10 //newline
|| c == 12 //form feed
|| c == 13 //carriage return
|| c == 9 //tab
|| c == 11; //vertical tab
}
public int getCurrLineno() {
return currLineNo;
}
}