Friday, March 27, 2009

Implementing a SQL Parser using JavaCC - 2

Let's start with a JavaCC grammar file for parsing SQL queries. A JavaCC grammar file for SQL parsing is divided into several sections, as given below:
1. Options section
This is the first and foremost section of the JavaCC grammar file. This section contains various JavaCC configuration options.
Eg:
// JavaCC generator options for the SQL grammar.
options {
// Match token literals without regard to letter case, so the lower-case
// keyword literals in the grammar also accept SELECT, Select, etc.
IGNORE_CASE = true;
// Generate a parser with instance (non-static) members.
STATIC = false;
// Have the generated character stream handle Unicode input.
UNICODE_INPUT = true;

// some performance optimizations
// Turning error reporting off trades detailed parse-error messages for speed.
ERROR_REPORTING = false;
}
2. The next section is the block that is surrounded by PARSER_BEGIN(parserName) and PARSER_END(parserName). In this block, a Java compilation unit can be inserted. In the simplest case, it can be a very simple class declaration without any methods or variables inside it. The package declaration used in this section will be used in all the JavaCC-generated files such as the parser, token manager, token manager constants, char stream, etc.
 PARSER_BEGIN(BlitzSQLParser)
package net.java.blitz.db.sql.parser;
import java.util.List;
import java.util.LinkedList;
import net.java.blitz.db.sql.expression.*;
/**
* Parser class declaration for the SQL grammar; intentionally empty here.
* Do not edit the generated file directly. It is generated from BlitzSQLGrammar.jj.
*
* @author karthikeyan subramanian
*/
public class BlitzSQLParser{
}
PARSER_END(BlitzSQLParser)
3. This section is then followed by the set of regular expression productions that define how the parser should behave when a match is found. Four kinds of regular expression productions are available for this purpose: TOKEN, SKIP, MORE and SPECIAL_TOKEN. Now we define our regular expressions here. (Disclosure: Many of the SQL-related tokens are borrowed from AxionSQLParser from Axion.) Not all of these tokens are used in this SQL parser; they are kept because they can be useful in the future.
// ----------------------------------------------------------------------------
// TOKENS
// ----------------------------------------------------------------------------
// Whitespace between tokens is discarded: space, line feed, carriage return
// and tab.
SKIP:{
" "
| "\n"
| "\r"
| "\t"
}
// SQL line comment: two dashes followed by everything up to the end of the
// line. The line terminator is optional so that a comment on the last line of
// the input, with no trailing newline, is still skipped instead of falling
// through to the ordinary token rules.
SKIP:{
<LINE_COMMENT:"--"(~["\n", "\r"])*("\n"
| "\r"
| "\r\n")?>
}
// C-style block comment (slash-star ... star-slash), skipped entirely.
// The expression consumes the opener, then everything up to the first star,
// and keeps extending until a star is immediately followed by a slash.
// Comments do not nest: the first closing star-slash ends the comment.
SKIP:{
<BLOCK_COMMENT:"/*"(~["*"])*"*"("*"
| (~["*", "/"](~["*"])*"*"))*"/">
}
// SQL keywords, one token per word, listed alphabetically by first letter.
// The literals are written in lower case; the options block sets
// IGNORE_CASE = true, so each one matches in any letter case.
// These tokens are declared before <ID>, so when a keyword and <ID> match the
// same text the keyword token is the one produced.
// DEFAULT_ carries a trailing underscore in its label; its literal is still "default".
TOKEN:// KEYWORDS
{
<ADD:"add">
| <ALL:"all">
| <ALTER:"alter">
| <ALWAYS:"always">
| <AND:"and">
| <AS:"as">
| <ASC:"asc">
| <BEGIN:"begin">
| <BETWEEN:"between">
| <BOTH:"both">
| <BY:"by">
| <CAST:"cast">
| <CASCADE:"cascade">
| <CASE:"case">
| <CHECK:"check">
| <CREATE:"create">
| <COLUMN:"column">
| <CONSTRAINT:"constraint">
| <CURRENT_TIMESTAMP:"current_timestamp">
| <CURRENT_DATE:"current_date">
| <CURRENT_TIME:"current_time">
| <CYCLE:"cycle">
| <DATABASE:"database">
| <DATA:"data">
| <DAY:"day">
| <DEFAULT_:"default">
| <DEFERRED:"deferred">
| <DEFERRABLE:"deferrable">
| <DEFRAG:"defrag">
| <DELETE:"delete">
| <DESC:"desc">
| <DISTINCT:"distinct">
| <DROP:"drop">
| <ELSE:"else">
| <END:"end">
| <ESCAPE:"escape">
| <EXCEPTION:"exception">
| <EXISTS:"exists">
| <EXPLAIN:"explain">
| <EXTRACT:"extract">
| <EXTERNAL:"external">
| <FALSE:"false">
| <FIRST:"first">
| <FOR:"for">
| <FOREIGN:"foreign">
| <FROM:"from">
| <FULL:"full">
| <GENERATED:"generated">
| <GROUP:"group">
| <HAVING:"having">
| <HOUR:"hour">
| <IDENTITY:"identity">
| <IF:"if">
| <INCREMENT:"increment">
| <IMMEDIATE:"immediate">
| <IN:"in">
| <INITIALLY:"initially">
| <INDEX:"index">
| <INNER:"inner">
| <INSERT:"insert">
| <INTO:"into">
| <IS:"is">
| <JOIN:"join">
| <KEY:"key">
| <LEADING:"leading">
| <LEFT:"left">
| <LIKE:"like">
| <LIMIT:"limit">
| <LINK:"link">
| <MAXVALUE:"maxvalue">
| <MATCHED:"matched">
| <MERGE:"merge">
| <MINUTE:"minute">
| <MINVALUE:"minvalue">
| <MILLISECOND:"millisecond">
| <MONTH:"month">
| <NEXT:"next">
| <NO:"no">
| <NOT:"not">
| <NULL:"null">
| <OFFSET:"offset">
| <ON:"on">
| <OR:"or">
| <ORDER:"order">
| <ORGANIZATION:"organization">
| <OUTER:"outer">
| <POSITION:"position">
| <PRIMARY:"primary">
| <QUARTER:"quarter">
| <RIGHT:"right">
| <REFERENCES:"references">
| <RENAME:"rename">
| <RESTART:"restart">
| <SECOND:"second">
| <SELECT:"select">
| <SEQUENCE:"sequence">
| <SET:"set">
| <SOUNDS:"sounds">
| <START:"start">
| <SUBSTRING:"substring">
| <SYSDATE:"sysdate">
| <TABLE:"table">
| <THEN:"then">
| <TO:"to">
| <TRAILING:"trailing">
| <TRIM:"trim">
| <TRUE:"true">
| <TRUNCATE:"truncate">
| <UNIQUE:"unique">
| <UPDATE:"update">
| <UPSERT:"upsert">
| <USER:"user">
| <USING:"using">
| <VALUES:"values">
| <VALUE:"value">
| <VIEW:"view">
| <WEEK:"week">
| <WHEN:"when">
| <WHERE:"where">
| <WITH:"with">
| <YEAR:"year">
}
// SQL data type names, one token per name. As with the keywords, the literals
// are lower case and rely on IGNORE_CASE = true from the options block to
// match any letter case.
// Names that share a prefix (for example "varchar" and "varchar2", "dec" and
// "decimal") are told apart because the token manager prefers the longest match.
TOKEN:// DATA TYPES
{
<BIT:"bit">
| <BYTE:"byte">
| <INT:"int">
| <REAL:"real">
| <CLOB:"clob">
| <BLOB:"blob">
| <CHAR:"char">
| <CHARACTER:"character">
| <DATE:"date">
| <TIME:"time">
| <FLOAT:"float">
| <BIGINT:"bigint">
| <LONG:"long">
| <RAW:"raw">
| <STRING:"string">
| <BINARY:"binary">
| <NUMERIC:"numeric">
| <NUMBER:"number">
| <DECIMAL:"decimal">
| <DEC:"dec">
| <BOOLEAN:"boolean">
| <TINYINT:"tinyint">
| <INTEGER:"integer">
| <VARCHAR:"varchar">
| <VARCHAR2:"varchar2">
| <LONGVARCHAR:"longvarchar">
| <TEXT:"text">
| <SMALLINT:"smallint">
| <SHORT:"short">
| <VARBINARY:"varbinary">
| <LONGVARBINARY:"longvarbinary">
| <IMAGE:"image">
| <VARYING:"varying">
| <LARGE:"large">
| <TIMESTAMP:"timestamp">
| <OBJECT:"object">
| <JAVA_OBJECT:"java_object">
| <DOUBLE:"double">
}
// Numeric and string literals.
//  - INTEGER_LITERAL: one or more decimal digits.
//  - FLOATING_POINT_LITERAL: digits "." digits, or "." digits, each with an
//    optional exponent; or digits with a mandatory exponent. A bare run of
//    digits is deliberately NOT an alternative here: it would tie in length
//    with INTEGER_LITERAL, which is declared first and therefore always wins,
//    so such an alternative could never be selected.
//  - EXPONENT: private (#) helper, "e" or "E", optional sign, digits.
//  - STRING_LITERAL: single-quoted text; a quote inside the text is written
//    as two consecutive single quotes.
TOKEN:// LITERALS
{
<INTEGER_LITERAL:(["0"-"9"])+>
| <FLOATING_POINT_LITERAL:(["0"-"9"])+"."(["0"-"9"])+(<EXPONENT>)?
| "."(["0"-"9"])+(<EXPONENT>)?
| (["0"-"9"])+<EXPONENT>>
| <#EXPONENT:["e", "E"](["+", "-"])?(["0"-"9"])+>
| <STRING_LITERAL:"'"(~["'"])*("''"(~["'"])*)*"'">
}
// Unquoted identifier: one or more letters, followed by any mix of letters,
// digits, "_", "$" and "#". An identifier therefore cannot begin with a digit,
// "_", "$" or "#".
// LETTER (A-Z, a-z) and DIGIT (0-9) are private (#) helper expressions used
// only to build other expressions; they are not tokens of their own.
TOKEN:// IDENTIFIERS
{
<ID:(<LETTER>)+("_"
| "$"
| "#"
| <DIGIT>
| <LETTER>)*>
| <#LETTER:["A"-"Z", "a"-"z"]>
| <#DIGIT:["0"-"9"]>
}
// Punctuation and operator tokens.
// Several literals share a first character ("<" with "<=" and "<>", ">" with
// ">=", "(" with "(+)"); the token manager takes the longest match, so the
// multi-character operator is produced whenever the full text is present.
// Both "!=" and "<>" are accepted as not-equal, under separate token names.
TOKEN:// SEPARATORS AND OPERATORS
{
<ASSIGN:":=">
| <COMMA:",">
| <CONCAT:"||">
| <SEMICOLON:";">
| <DOT:".">
| <LESS:"<">
| <LESSEQUAL:"<=">
| <GREATER:">">
| <GREATEREQUAL:">=">
| <EQUAL:"=">
| <NOTEQUAL:"!=">
| <NOTEQUAL2:"<>">
| <JOINPLUS:"(+)">
| <OPENPAREN:"(">
| <CLOSEPAREN:")">
| <ASTERISK:"*">
| <SLASH:"/">
| <PLUS:"+">
| <MINUS:"-">
| <QUESTIONMARK:"?">
}
// Quoted identifiers are recognised with two extra lexical states:
//   DEFAULT                 on a double quote    -> STATE_QuotedIdentStart
//   STATE_QuotedIdentStart  on text shaped as ID -> STATE_QuotedIdentEnd
//   STATE_QuotedIdentEnd    on a double quote    -> DEFAULT
// Only QUOTED_IDENTIFIER is defined in STATE_QuotedIdentStart, so a word that
// would be a keyword in DEFAULT is returned as QUOTED_IDENTIFIER inside quotes.
// NOTE(review): no SKIP production is declared for the two extra states, so
// whitespace inside the quotes is presumably a lexical error; confirm this is intended.
TOKEN:// START QUOTED IDENTIFIER
{
<START_QUOTED_IDENTIFIER:"\"">:STATE_QuotedIdentStart
}
<STATE_QuotedIdentStart>TOKEN:{
<QUOTED_IDENTIFIER:<ID>>:STATE_QuotedIdentEnd
}
<STATE_QuotedIdentEnd>TOKEN:// IDENTIFIER ESCAPE CHAR
{
<END_QUOTED_IDENTIFIER:"\"">:DEFAULT
}
The SKIP regular expression production specifies that input matching these regular expressions should be skipped (ignored), whereas TOKEN specifies that the matching input is to be treated as a valid token and should be processed.

(To be continued)...

2 comments:

  1. FYI, "OPTIMIZE_TOKEN_MANAGER" is no longer a valid JavaCC option... it was removed in JavaCC 4.1.

    ReplyDelete
  2. Thanks for correcting me. I will correct that.

    ReplyDelete