clamav/libclamav/jsparse/js-norm.c

/*
 *  Javascript normalizer.
 *
 *  Copyright (C) 2008 Sourcefire, Inc.
 *
 *  Authors: Török Edvin
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 *  MA 02110-1301, USA.
 */

#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#define BUFS 65536
#include "lexglobal.h"
#include "hashtab.h"
#include "others.h"
#include "js-norm.h"
#include "jsparse/generated/operators.h"
#include "jsparse/generated/keywords.h"

/* ----------- tokenizer ---------------- */
/* Modes of the hand-written tokenizer. The active mode is kept in the
 * `state` member of struct scanner. */
enum tokenizer_state {
	Initial,
	MultilineComment,
	SinglelineComment,
	Number,
	DoubleQString,
	SingleQString,
	Identifier
};

/* Growable byte buffer; see the textbuffer_*() helpers for how it is filled. */
struct text_buffer {
	char *data; /* heap storage, (re)allocated by textbuffer_ensure_capacity() */
	size_t pos; /* offset of the next byte to write, i.e. bytes used so far */
	size_t capacity; /* bytes allocated for data */
};

/* Tokenizer instance. yyscan_t is a pointer to this structure. */
typedef struct scanner {
	enum tokenizer_state state;
	struct text_buffer buf; /* text being accumulated; detached by textbuffer_done() */
	const char *yytext; /* set by textbuffer_done() */
	size_t yylen; /* set by textbuffer_done() to buf.pos - 1 */
	/* presumably the input chunk, its size and the read offset into it;
	 * confirm against yy_scan_bytes()/yylex(), which are not visible here */
	const char *in;
	size_t insize;
	size_t pos;
} *yyscan_t;

/* Buffer handle returned by yy_scan_bytes() and passed to yy_delete_buffer(). */
typedef int YY_BUFFER_STATE;

/* Forward declarations of the tokenizer entry points used by the parser
 * below. The names mirror the flex reentrant-scanner API. */
static int yylex( YYSTYPE *lvalp, yyscan_t  );
static void yy_delete_buffer( YY_BUFFER_STATE, yyscan_t);
static YY_BUFFER_STATE yy_scan_bytes( const char *, size_t, yyscan_t scanner );
static const char *yyget_text ( yyscan_t scanner );
static int yyget_leng ( yyscan_t scanner );
static int yylex_init ( yyscan_t * ptr_yy_globals ) ;
static void yyset_debug (int debug_flag ,yyscan_t yyscanner );
static int yylex_destroy ( yyscan_t yyscanner ) ;
/* ----------- tokenizer end ---------------- */

/* Per-scope parser state, driven by cli_js_process_buffer(). */
enum fsm_state {
	Base, /* default; also restored on ';' and on '{' */
	InsideVar, /* entered on TOK_VAR; the next identifier is a declaration */
	InsideInitializer, /* entered after an identifier has been declared */
	WaitFunctionName, /* entered on TOK_FUNCTION */
	WaitParameterList, /* entered after the function name identifier */
	InsideFunctionDecl /* entered on '(' after TOK_FUNCTION / the function name */
};

/* One identifier scope. A new scope is created for every TOK_FUNCTION. */
struct scope {
	struct hashtable id_map; /* identifier -> unique id, or -1 when only used, not declared */
	struct scope *parent;/* hierarchy */
	struct scope *nxt;/* all scopes kept in a list so we can easily free all of them */
	enum fsm_state fsm_state;
	int  last_token; /* type of the last token added while this scope was current */
	unsigned int brackets; /* count of '[' not yet closed by ']' */
	unsigned int blocks; /* count of '{' not yet closed by '}' */
};

/* Growable array of tokens; grown by tokens_ensure_capacity(). */
struct tokens {
	yystype *data; /* token storage */
	size_t   cnt; /* number of tokens in use */
	size_t   capacity; /* number of tokens data has room for */
};

/* state for the current JS file being parsed */
/* state for the current JS file being parsed */
struct parser_state {
	unsigned long     var_uniq; /* next id handed out by scope_declare() */
	unsigned long     syntax_errors; /* incremented on unexpected tokens */
	struct scope *global; /* outermost scope; NULL marks the state as unusable for parsing */
	struct scope *current; /* scope tokens are currently attributed to */
	struct scope *list; /* head of the list of all scopes (linked through nxt) */
	yyscan_t scanner;
	struct tokens tokens; /* every token seen so far */
};

/* Create a scope nested inside state->current, register it in state->list
 * and make it the current scope.
 * Returns the new scope, or NULL if allocation or hashtable setup failed
 * (in which case state is left unchanged). */
static struct scope* scope_new(struct parser_state *state)
{
	struct scope *const outer = state->current;
	struct scope *fresh = cli_calloc(1, sizeof(*fresh));

	if(fresh && hashtab_init(&fresh->id_map, 10) < 0) {
		free(fresh);
		fresh = NULL;
	}
	if(!fresh)
		return NULL;

	fresh->parent = outer;
	fresh->fsm_state = Base;
	/* push onto the list of all scopes */
	fresh->nxt = state->list;
	state->list = fresh;
	state->current = fresh;
	return fresh;
}

/* Release a scope and its identifier map.
 * Returns the parent of the released scope. */
static struct scope* scope_done(struct scope *s)
{
	struct scope *const outer = s->parent;

	/* TODO: have a hashtab_destroy */
	hashtab_clear(&s->id_map);
	free(s->id_map.htable);

	free(s);
	return outer;
}

/* transitions:
 *   Base --(VAR)--> InsideVar
 *   InsideVar --(Identifier)-->InsideInitializer
 *   InsideVar --(anything_else) --> POP (to Base)
 *   InsideInitializer --(COMMA)--> POP (to InsideVar)
 *   InsideInitializer | InsideVar --(SEMICOLON) --> POP (to Base)
 *   InsideInitializer --(BRACKET_OPEN) --> WaitBrClose
 *   InsideInitializer --(PAR_OPEN) --> WaitParClose
 *   WaitBrClose --(BRACKET_OPEN) --> increase depth
 *   WaitBrClose --(BRACKET_CLOSE) --> POP
 *   WaitParClose --(PAR_CLOSE) --> POP
 *   WaitParClose --(PAR_OPEN) --> increase depth
 */

/* Base --(VAR)--> PUSH, to InsideVar
 * InsideVar --(Identifier)--> InsideInitializer
 * InsideVar --(ELSE)--> POP, inc. syntax_errors
 * InsideInitializer --(COMMA)--> POP (to InsideVar)
 * --(BRACKET_OPEN)--> inc bracket_counter
 * --(PAR_OPEN)--> inc par_counter
 * --(BRACKET_CLOSE) --> dec bracket_counter
 * --(PAR_CLOSE)--> dec par_counter
 * --(VAR)--> PUSH, to InsideVar (if bracket_counter != 0 || par_counter != 0)
 *        --> POP, to InsideVar, inc. syntax_errors (if bracket_counter == 0  && par_counter == 0)
 *  POP only allowed if bracket_counter == 0 && par_counter == 0 
 *
 * InsideInitializer acts differently, make it only a flag
 * ....................
 *
 * Pushing and popping is done when entering / exiting function scopes,
 * tracking { and function ( is done by the function scope tracker too.
 *
 * we only need to track brackets.
 */


/*
 * var x = document;
 * x.writeln(...);
 *
 * ^we must not normalize member method names
 */

/*
 * Variables are declared at function scope, and their initial value is
 * undefined. At the point where the initializer is, and from there on the value
 * is defined.
 *
 * { doesn't introduce a new variable scope, they are in function's scope too
 *
 * function foo() {
 *  alert(x); -> x exists, undefined
 *  var x=5; 
 *  alert(x); -> x exists, =5
 * }
 * 
 * vs.
 *
 * function bar() {
 *   alert(x);//error, x not declared
 *   x=5;
 *   }
 *
 * vs.
 *
 * but we can declare variables without var, only valid if we use them after
 * assigning.
 *
 * function foobar() {
 *   x=5;
 *   alert(x);//x is defined, value is 5
 *   }
 *
 * other examples:
 * function foo2() {
 *   alert(x); -> x exists, undefined
 *   {
 *       var x=5; -> x equals to 5
 *   }
 *   alert(x); -> x is 5
 * }
 *
 * function foo3() {
 *   var x=4; -> x exists, equals to 4
 *   alert(x); -> x exists, equals to 4
 *   {
 *       var x=5; -> x equals to 5
 *   }
 *   alert(x); -> x is 5
 * }
 *
 * function bar3() {
 *   //same as foo3
 *   var x=4;
 *   alert(x);
 *   { 
 *        x=5;
 *   }
 *   alert(x);
 * }
 *
 */


/* Declare identifier `token` (length `len`) in scope `s`, handing it the next
 * unique id from state->var_uniq.
 * Returns the key string stored in the scope's hashtable, or NULL if the
 * insert failed. */
static const char* scope_declare(struct scope *s, const char *token, const size_t len, struct parser_state *state)
{
	const struct element *entry;

	/* hashtab_insert either finds an already existing entry, or allocates a
	 * new one, we return the allocated string */
	entry = hashtab_insert(&s->id_map, token, len, state->var_uniq++);
	if(!entry)
		return NULL;
	return entry->key;
}

/* Record a use of identifier `token` (length `len`) in scope `s`.
 * Returns the key string stored in the scope's hashtable, or NULL if a new
 * entry could not be inserted. */
static const char* scope_use(struct scope *s, const char *token, const size_t len)
{
	const struct element *entry = hashtab_find(&s->id_map, token, len);

	if(!entry) {
		/* Not yet known in this scope: add it with ID -1.
		 * A later declaration will assign a unique ID; otherwise the
		 * -1 marks it as belonging to an outer scope. */
		entry = hashtab_insert(&s->id_map, token, len, -1);
		if(!entry)
			return NULL;
	}
	/* when the entry already existed we deliberately leave its ID alone */
	return entry->key;
}

/* Find the unique id of identifier `token` (length `len`), searching scope
 * `s` first and then each enclosing scope.
 * Returns the id, or -1 when no scope holds a declared (id != -1) entry. */
static long scope_lookup(struct scope *s, const char *token, const size_t len)
{
	for(; s; s = s->parent) {
		const struct element *entry = hashtab_find(&s->id_map, token, len);

		/* missing, or only "used" here: keep looking in the outer scope */
		if(!entry || entry->data == -1)
			continue;
		return entry->data;
	}
	return -1;
}

static int tokens_ensure_capacity(struct tokens *tokens, size_t cap)
{
	if(tokens->capacity < cap) {
		tokens->capacity = cap + 1024;
		tokens->data = cli_realloc2(tokens->data, tokens->capacity * sizeof(*tokens->data));
		if(!tokens->data)
			return CL_EMEM;
	}
	return CL_SUCCESS;
}

static int add_token(struct parser_state *state, const yystype *token)
{
	if(tokens_ensure_capacity(&state->tokens, state->tokens.cnt + 1) == -1)
		return -1;
	state->tokens.data[state->tokens.cnt++] = *token;
	return 0;
}

/* Output buffer that is flushed to a file descriptor with write(). */
struct buf {
	size_t pos; /* number of bytes currently held in buf */
	int outfd; /* descriptor the buffer is flushed to */
	char buf[65536]; /* pending output */
};

/* Queue one character for output, flushing the buffer to buf->outfd first
 * if it is full.
 * Returns CL_SUCCESS, or CL_EIO if the flush did not write the whole buffer
 * (the character is not stored in that case). */
static inline int buf_outc(char c, struct buf *buf)
{
	if(buf->pos < sizeof(buf->buf)) {
		/* common case: there is still room */
		buf->buf[buf->pos++] = c;
		return CL_SUCCESS;
	}

	if(write(buf->outfd, buf->buf, sizeof(buf->buf)) != sizeof(buf->buf))
		return CL_EIO;
	/* buffer is empty again, start over with the new character */
	buf->buf[0] = c;
	buf->pos = 1;
	return CL_SUCCESS;
}

/* Queue the NUL-terminated string `s` for output.
 *
 * Whenever the string does not fit in the remaining space, the buffer is
 * topped up, flushed to buf->outfd, and filling continues with the rest of
 * the string.
 *
 * Returns CL_SUCCESS, or CL_EIO if a flush fails or is short (same policy as
 * buf_outc()). */
static inline int buf_outs(const char *s, struct buf *buf)
{
	const size_t buf_len = sizeof(buf->buf);
	size_t len = strlen(s);

	while(buf->pos + len > buf_len) {
		const size_t chunk = buf_len - buf->pos;

		memcpy(buf->buf + buf->pos, s, chunk);
		/* advance the source too, otherwise the next round would copy
		 * the beginning of the string again */
		s += chunk;
		len -= chunk;
		if(write(buf->outfd, buf->buf, buf_len) != (ssize_t)buf_len)
			return CL_EIO;
		buf->pos = 0;
	}
	memcpy(buf->buf + buf->pos, s, len);
	buf->pos += len;
	return CL_SUCCESS;
}

/* Emit a separating space when both the previously written character and
 * the first character about to be written are alphanumeric, so that two
 * adjacent tokens do not merge into one. */
static inline void output_space(char last, char current, struct buf *out)
{
	/* <ctype.h> functions need a value representable as unsigned char;
	 * passing a plain (possibly negative) char is undefined behaviour */
	if(isalnum((unsigned char)last) && isalnum((unsigned char)current))
		buf_outc(' ', out);
}


/* Write one token to `out`.
 *
 * token:    token to print
 * scope:    scope used to look up the normalized name of identifiers
 * lastchar: value returned by the previous call ('\0' for the first token);
 *           used to decide whether a separating space is needed
 *
 * Returns a character describing what was written last: '"' after a string
 * literal, '0' after a number, 'a' after an identifier or `function`, the
 * final character of the text for any other token, and '\0' if nothing was
 * written. */
static char output_token(const yystype *token, struct scope *scope, struct buf *out, char lastchar)
{
	char numbuf[128];
	const char *text = TOKEN_GET(token, cstring);
	char emitted = '\0';

	switch(token->type) {
		case TOK_StringLiteral:
			output_space(lastchar,'"', out);
			buf_outc('"', out);
			if(text)
				buf_outs(text, out);
			buf_outc('"', out);
			emitted = '\"';
			break;
		case TOK_NumericInt:
			output_space(lastchar,'0', out);
			snprintf(numbuf, sizeof(numbuf), "%ld", TOKEN_GET(token, ival));
			buf_outs(numbuf, out);
			emitted = '0';
			break;
		case TOK_NumericFloat:
			output_space(lastchar,'0', out);
			snprintf(numbuf, sizeof(numbuf), "%e", TOKEN_GET(token, dval));
			buf_outs(numbuf, out);
			emitted = '0';
			break;
		case TOK_IDENTIFIER_NAME:
			output_space(lastchar,'a', out);
			if(text) {
				const long id = scope_lookup(scope, text, strlen(text));
				if(id != -1) {
					/* declared identifier: print its normalized name */
					snprintf(numbuf, sizeof(numbuf), "n%03ld",id);
					text = numbuf;
				}
				/* otherwise the identifier is printed unchanged */
				buf_outs(text, out);
			}
			emitted = 'a';
			break;
		case TOK_FUNCTION:
			/*TODO: output function name */
			output_space(lastchar,'a', out);
			buf_outs("function",out);
			emitted = 'a';
			break;
		default:
			if(text) {
				const size_t text_len = strlen(text);
				output_space(lastchar,text[0], out);
				buf_outs(text, out);
				if(text_len)
					emitted = text[text_len-1];
			}
			break;
	}
	return emitted;
}

/*
 * We can't delete the scope as soon as we see a }, because
 * we still need the hashmap from it.
 *
 * If we would normalize all the identifiers, and output when a scope is closed,
 * then it would be impossible to normalize calls to other functions.
 *
 * So we need to keep all scopes in memory, to do this instead of scope_done, we
 * simply just set current = current->parent when a scope is closed.
 * We keep a list of all scopes created in parser_state. When we have parsed
 * everything, we output everything, and then delete all scopes.
 *
 * We also need to know where to switch scopes on the second pass, so for
 * TOK_FUNCTION types we will use another pointer, that points to the scope
 * (added to yystype's union).
 *
 * We lookup the identifier in the scope (using scope_lookup, it looks in parent
 * scopes too), if ID is found then output (n%3d, Id),
 * otherwise output the identifier as is.
 *
 * To make  it easier to match sigs, we do a xfrm : 
 * 'function ID1 (..'. => 'n%3d = function (...'
 */

/*
 * we'll add all identifier to the scope's map
 * those that are not decl. will have initial ID -1
 * if we later see a decl for it in same scope, it'll automatically get a
 * correct ID.
 *
 * When parsing of local scope is done, we take any ID -1 identifiers,
 * and push them up one level (careful not to overwrite existing IDs).
 *
 * it would be nice if the tokens would contain a link to the entry in the
 * hashtab, a link that automatically gets updated when the element is moved
 * (pushed up). This would prevent subsequent lookups in the map,
 * when we want to output the tokens.
 * There is no easy way to do that, so we just do another lookup
 *
 */

/*
 * This actually works, redefining foo:
 * function foo() {
 *   var foo=5; alert(foo);
 * }
 * So we can't treat function names just as any other identifier?
 * We can, because you can no longer call foo, if you redefined it as a var.
 * So if we rename both foo-s with same name, it will have same behaviour.
 *
 * This means that a new scope should begin after function, and not after
 * function ... (.
 */

/* Release every scope on the list starting at `p` (linked through nxt).
 * An empty list (p == NULL) is accepted and is a no-op, so this is safe to
 * call on a parser state whose initialization failed. */
static void scope_free_all(struct scope *p)
{
	while(p) {
		/* read the link before scope_done() frees the node */
		struct scope *nxt = p->nxt;
		scope_done(p);
		p = nxt;
	}
}

void cli_strtokenize(char *buffer, const char delim, const size_t token_count, const char **tokens);
/* Check that `tokens` starts with the parameter list
 *   ( name0 , name1 , ... , nameN-1 )
 * where the names are exactly param_names[0..count-1].
 * Returns 0 on a match and -1 otherwise.
 * The caller must guarantee that 2*count + 1 tokens are readable. */
static int match_parameters(const yystype *tokens, const char ** param_names, size_t count)
{
	size_t tok_idx = 1, name_idx;

	if(tokens[0].type != TOK_PAR_OPEN)
		return -1;

	for(name_idx = 0; name_idx < count; name_idx++) {
		const int is_last = (name_idx + 1 == count);
		const char *ident_text = TOKEN_GET(&tokens[tok_idx], cstring);

		if(tokens[tok_idx].type != TOK_IDENTIFIER_NAME)
			return -1;
		if(!ident_text || strcmp(ident_text, param_names[name_idx]) != 0)
			return -1;
		tok_idx++;

		/* names are separated by ',' and the last one is followed by ')' */
		if(is_last) {
			if(tokens[tok_idx].type != TOK_PAR_CLOSE)
				return -1;
		} else if(tokens[tok_idx].type != TOK_COMMA) {
			return -1;
		}
		tok_idx++;
	}
	return 0;
}

/* Parameter-name lists that run_decoders() matches (via match_parameters())
 * against a function's parameter list to recognize a packer's decoder
 * function: function(p,a,c,k,e,r) and function(p,a,c,k,e,d). */
static const char *de_packer_3[] = {"p","a","c","k","e","r"};
static const char *de_packer_2[] = {"p","a","c","k","e","d"};


/* Larger of two values. Each argument may be evaluated twice, so do not pass
 * expressions with side effects. */
#ifndef MAX
#define MAX(a, b) ((a)>(b) ? (a) : (b))
#endif

/* Detach the accumulated text from the scanner's text buffer.
 *
 * The allocation is shrunk to the number of bytes used, yytext/yylen are
 * pointed at the result, and the text buffer is reset to empty.
 *
 * Returns the detached heap block (NULL if nothing was accumulated). */
static inline char *textbuffer_done(yyscan_t scanner)
{
	/* free unused memory */
	char *str = cli_realloc(scanner->buf.data, scanner->buf.pos);
	if(!str) {
		/* shrinking failed; the code relies on the original block
		 * still being valid in that case, so keep using it */
		str = scanner->buf.data;
	}
	/* use the pointer that is valid after the realloc: the old one
	 * dangles whenever the block was moved */
	scanner->yytext = str;
	/* yylen is one less than the bytes used; don't wrap around when the
	 * buffer is empty */
	scanner->yylen = scanner->buf.pos ? scanner->buf.pos - 1 : 0;
	memset(&scanner->buf, 0, sizeof(scanner->buf));
	return str;
}

/* Make sure `len` more bytes can be written at txtbuf->pos.
 *
 * The buffer grows by at least 4096 bytes at a time.
 * Returns 0 on success, -1 if the buffer could not be grown; on failure the
 * buffer (data, pos and capacity) is left exactly as it was. */
static inline int textbuffer_ensure_capacity(struct text_buffer *txtbuf, size_t len)
{
	const size_t needed = txtbuf->pos + len;
	size_t grown;
	char *d;

	if(needed < txtbuf->pos)
		return -1; /* size computation wrapped around */
	if(needed <= txtbuf->capacity)
		return 0;

	grown = MAX(needed, txtbuf->capacity + 4096);
	d = cli_realloc(txtbuf->data, grown);
	if(!d)
		return -1;
	/* commit the new capacity only now: recording it before a failed
	 * realloc would make later calls believe the room exists */
	txtbuf->data = d;
	txtbuf->capacity = grown;
	return 0;
}

/* Append `len` bytes starting at `s` to the text buffer.
 * If the buffer cannot be grown the bytes are dropped and the buffer is left
 * unchanged (the function has no way to report the failure). */
static inline void textbuffer_append_len(struct text_buffer *txtbuf, const char *s, size_t len)
{
	if(textbuffer_ensure_capacity(txtbuf, len) < 0)
		return; /* no room: copying anyway would overrun the allocation */
	if(!len)
		return; /* nothing to copy; also keeps memcpy away from a NULL data pointer */
	memcpy(&txtbuf->data[txtbuf->pos], s, len);
	txtbuf->pos += len;
}


/* Append the NUL-terminated string `s` to the text buffer. The terminating
 * NUL itself is not appended. */
static inline void textbuffer_append(struct text_buffer *txtbuf, const char *s)
{
	textbuffer_append_len(txtbuf, s, strlen(s));
}

/* Append a single character to the text buffer.
 * If the buffer cannot be grown the character is dropped (the function has
 * no way to report the failure). */
static inline void textbuffer_putc(struct text_buffer *txtbuf, const char c)
{
	if(textbuffer_ensure_capacity(txtbuf, 1) < 0)
		return; /* no room: storing anyway would write out of bounds */
	txtbuf->data[txtbuf->pos++] = c;
}
/* Prefix prepended to this file's debug and warning messages. */
#define MODULE "JS-Norm: "

/* Release the heap string owned by a token, if it has one.
 * Only tokens whose value type is vtype_string own their string; the pointer
 * is cleared afterwards so that releasing the token twice is harmless. */
static void free_token(yystype *token)
{
	if(token->vtype != vtype_string)
		return;

	free(token->val.string);
	token->val.string = NULL;
}

/* Replace tokens [start, end) of `dst` with the tokens of `with`.
 *
 * dst:   token list to modify
 * start: index of the first token to remove
 * end:   index one past the last token to remove
 * with:  replacement tokens, or NULL to only delete the range. The tokens
 *        are copied shallowly, so strings they own belong to `dst`
 *        afterwards; the caller must release only with->data itself.
 *
 * Returns CL_SUCCESS on success, -1 if the range is invalid (nothing is
 * modified) and CL_EMEM if `dst` could not be grown. */
static int replace_token_range(struct tokens *dst, size_t start, size_t end, const struct tokens *with)
{
	const size_t len = with ? with->cnt : 0;
	size_t i;
	/* %lu expects unsigned long; size_t may be a different type */
	cli_dbgmsg(MODULE "Replacing tokens %lu - %lu with %lu tokens\n",
		   (unsigned long)start, (unsigned long)end, (unsigned long)len);
	/* start > end would make (end - start) wrap around below */
	if(start >= dst->cnt || end > dst->cnt || start > end)
		return -1;
	/* grow first, so that a failure leaves the old tokens untouched */
	if(tokens_ensure_capacity(dst, dst->cnt - (end-start) + len) != CL_SUCCESS)
		return CL_EMEM;
	for(i=start;i<end;i++) {
		free_token(&dst->data[i]);
	}
	/* shift the tail so that exactly `len` slots are free at `start` */
	memmove(&dst->data[start+len], &dst->data[end], (dst->cnt - end) * sizeof(dst->data[0]));
	if(with && len > 0) {
		memcpy(&dst->data[start], with->data, len * sizeof(dst->data[0]));
	}
	dst->cnt = dst->cnt - (end-start) + len;
	return CL_SUCCESS;
}

/* Append all tokens of `src` to the end of `dst`.
 *
 * The tokens are copied shallowly, so strings they own belong to `dst`
 * afterwards; the caller must release only src->data itself.
 *
 * Returns CL_SUCCESS on success (including when `src` is empty),
 * CL_ENULLARG if either argument is NULL and CL_EMEM if `dst` could not be
 * grown. */
static int append_tokens(struct tokens *dst, const struct tokens *src)
{
	if(!dst || !src)
		return CL_ENULLARG;
	/* nothing to do when the SOURCE is empty; an empty destination is a
	 * perfectly valid target and must still receive the tokens */
	if(!src->cnt)
		return CL_SUCCESS;
	if(tokens_ensure_capacity(dst, dst->cnt + src->cnt) != CL_SUCCESS)
		return CL_EMEM;
	cli_dbgmsg(MODULE "Appending %lu tokens\n", (unsigned long)src->cnt);
	memcpy(&dst->data[dst->cnt], src->data, src->cnt * sizeof(dst->data[0]));
	dst->cnt += src->cnt;
	return CL_SUCCESS;
}

/* Decode the payload of a p,a,c,k,e,r style packer call.
 *
 * params: the six argument tokens of the call. Only three are used:
 *         params[0] is the packed text, params[1] the numeric base and
 *         params[3] the '|' separated word list. The word list string is
 *         split in place by cli_strtokenize().
 * txtbuf: receives the decoded text. It is reset first (any previous
 *         content is NOT freed). On return data/pos describe the decoded
 *         bytes; data is additionally NUL-terminated, the terminator is not
 *         counted in pos. If an argument is missing the buffer stays empty.
 *
 * Every alphanumeric run of the packed text is read as a number in the
 * given base and replaced by the word with that index; runs whose index is
 * out of range or whose word is empty are copied unchanged. */
static void decode_de(yystype *params[], struct text_buffer *txtbuf)
{
	const char *p = TOKEN_GET(params[0], cstring);
	const long a = TOKEN_GET(params[1], ival);
	/*const char *c = params[2];*/
	char *k = TOKEN_GET(params[3], string);
	/*const char *r = params[5];*/

	unsigned val=0;
	unsigned nsplit = 0;
	const char* o;
	const char **tokens;

	memset(txtbuf, 0, sizeof(*txtbuf));
	if(!p || !k )
		return;
	for(o = k; *o; o++) if(*o == '|') nsplit++;
	nsplit++;
	/* zero-initialized, so entries cli_strtokenize() might leave untouched
	 * are seen as "no word" by the check below */
	tokens = calloc(nsplit, sizeof(*tokens));
	if(!tokens) {
		return;
	}
	cli_strtokenize(k,'|',nsplit, tokens);

	do {
		/* copy everything up to the next alphanumeric run, dropping
		 * the backslash of \' and \" */
		while(*p && !isalnum((unsigned char)*p)) {
			if(*p=='\\' && (p[1] == '\'' || p[1] == '\"'))
				p++;
			else
				textbuffer_putc(txtbuf, *p++);
		}
		if(!*p) break;
		val = 0;
		o = p;
		while(*p && isalnum((unsigned char)*p)) {
			unsigned x;
			unsigned char v = *p++;
			/* TODO: use a table here */
			if(v >= 'a') x = 10+v-'a';
			else if(v >= 'A') x = 36+v-'A';
			else x = v-'0';
			val = val*a+x;
		}
		if(val >= nsplit || !tokens[val] || !tokens[val][0])
			while(o!=p)
				textbuffer_putc(txtbuf, *o++);
		else	textbuffer_append(txtbuf, tokens[val]);
	} while (*p);
	free(tokens);
	/* NUL-terminate without counting the terminator in pos.
	 * (Appending the string "\0" would add nothing: its length is 0.) */
	if(textbuffer_ensure_capacity(txtbuf, 1) == 0)
		txtbuf->data[txtbuf->pos] = '\0';
}

/* Outcome of one decoder (handle_de / handle_df / handle_eval).
 * run_decoders() treats the result as valid only when pos_end > pos_begin. */
struct decode_result {
	struct text_buffer txtbuf; /* decoded source text; data is freed by run_decoders() */
	size_t pos_begin; /* index of the first token to replace */
	size_t pos_end; /* index one past the last token to replace */
        unsigned append:1; /* 0: tokens are replaced with new token(s),
                            1: old tokens are deleted, new ones appended at the end */
};

/* Gather the arguments of a call for handle_de().
 *
 * *pos is the index of the first argument token. For each argument the
 * first token is recorded in parameters[], then tokens are skipped up to the
 * ',' (or, for the last argument, the ')') that ends it.
 * Returns the number of arguments recorded; *pos is advanced past the tokens
 * consumed. Never reads at or beyond index cnt. */
static size_t handle_de_collect(yystype *tokens, size_t *pos, const size_t cnt,
				yystype *parameters[], const size_t parameters_cnt)
{
	size_t i = *pos, j;

	for(j = 0;j < parameters_cnt && i < cnt;j++) {
		const int is_last = (j == parameters_cnt-1);

		parameters[j] = &tokens[i++];
		/* bounds test first, so tokens[cnt] is never read */
		while (i < cnt && tokens[i].type != (is_last ? TOK_PAR_CLOSE : TOK_COMMA))
			i++;
		i++;
	}
	*pos = i;
	return j;
}

/* Decode a p,a,c,k,e,r style packer.
 *
 * tokens/cnt: the complete token array and its length
 * start:      index of the '(' opening the decoder function's parameter list
 * name:       name of the decoder function, or NULL if it is anonymous
 * res:        on success txtbuf holds the decoded text and
 *             [pos_begin, pos_end) the token range it replaces; otherwise
 *             res is left untouched
 *
 * The end of the function is located first. For a named function every later
 * call `name(` with six arguments is decoded (the last one wins); for an
 * anonymous function the first '(' after the function end is taken to start
 * the argument list. */
static void handle_de(yystype *tokens, size_t start, const size_t cnt, const char *name, struct decode_result *res)
{
	/* find function decl. end */
	size_t i, nesting = 1;
	yystype* parameters [6];
	const size_t parameters_cnt = 6;
	size_t first_arg = 0, last_arg = 0;
	int decoded = 0;

	for(i=start;i < cnt; i++) {
		if(tokens[i].type == TOK_FUNCTION) {
			/* a FUNCTION token with a scope opens a function, one
			 * without is the dummy marker closing it */
			if(TOKEN_GET(&tokens[i], scope))
				nesting++;
			else
				nesting--;
			if(!nesting)
				break;
		}
	}
	if(nesting)
		return;
	if(name) {
		/* find call to function */
		for(;i+2 < cnt; i++) {
			const char* token_val = TOKEN_GET(&tokens[i], cstring);
			if(tokens[i].type == TOK_IDENTIFIER_NAME &&
			   token_val &&
			   !strcmp(name, token_val) &&
			   tokens[i+1].type == TOK_PAR_OPEN) {

				i += 2;
				if(handle_de_collect(tokens, &i, cnt, parameters, parameters_cnt) == parameters_cnt) {
					/* decode_de() resets the buffer, so release
					 * the result of an earlier call first */
					if(decoded)
						free(res->txtbuf.data);
					decode_de(parameters, &res->txtbuf);
					first_arg = parameters[0] - tokens;
					last_arg = parameters[parameters_cnt-1] - tokens;
					decoded = 1;
				}
			}
		}
	} else {
		while(i<cnt && tokens[i].type != TOK_PAR_OPEN) i++;
		++i;
		if(i >= cnt) return;
		if(handle_de_collect(tokens, &i, cnt, parameters, parameters_cnt) == parameters_cnt) {
			decode_de(parameters, &res->txtbuf);
			first_arg = parameters[0] - tokens;
			last_arg = parameters[parameters_cnt-1] - tokens;
			decoded = 1;
		}
	}
	/* without a complete argument list parameters[] holds no usable
	 * pointers and res->txtbuf was never filled: report nothing */
	if(!decoded)
		return;
	res->pos_begin = first_arg;
	res->pos_end = last_arg + 1;
	if(res->pos_end + 2 < cnt &&
			tokens[res->pos_end].type == TOK_BRACKET_OPEN &&
			tokens[res->pos_end+1].type == TOK_BRACKET_CLOSE &&
			tokens[res->pos_end+2].type == TOK_PAR_CLOSE)
		res->pos_end += 3; /* {}) */
	else
		res->pos_end++; /* ) */
}

/* --------- this should be in str.c -------------------------------- */
/* Value of each byte when read as a hexadecimal digit: '0'-'9' map to 0-9,
 * 'A'-'F' and 'a'-'f' map to 10-15, every other byte maps to -1.
 * Indexed by the byte value (0-255), 16 entries per row. */
static const int hex_chars[256] = {
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     0, 1, 2, 3,  4, 5, 6, 7,  8, 9,-1,-1, -1,-1,-1,-1,
    -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
};

/* Numeric value (0-15) of the hexadecimal digit `c`, or -1 if `c` is not a
 * hexadecimal digit. */
static inline int cli_hex2int(const char c)
{
	/* index by the unsigned byte value so bytes >= 0x80 stay in range */
	const unsigned char idx = (const unsigned char)c;

	return hex_chars[idx];
}

/* Encode the 16-bit code unit `u` as UTF-8 at `dst`.
 *
 * dst must have room for 3 bytes. A value of 0 is written as the single byte
 * 0x01 so that the output never contains a NUL. Values in the surrogate
 * range 0xd800 - 0xdfff are not valid on their own, but are encoded like any
 * other value.
 *
 * Returns the number of bytes written (1, 2 or 3). */
static inline size_t output_utf8(uint16_t u, unsigned char* dst)
{
	size_t written;

	if(u == 0) {
		dst[0] = 0x1; /* don't add \0, add \1 instead */
		written = 1;
	} else if(u < 0x80) {
		dst[0] = u&0xff;            /* 0zzzzzzz */
		written = 1;
	} else if(u < 0x800) {
		dst[0] = 0xc0 | (u>>6);     /* 110yyyyy */
		dst[1] = 0x80 | (u & 0x3f); /* 10zzzzzz */
		written = 2;
	} else {
		/* u < 0x10000 because we only handle utf-16 */
		dst[0] = 0xe0 | (u>>12);        /* 1110xxxx */
		dst[1] = 0x80 | ((u>>6)&0x3f); /* 10yyyyyy */
		dst[2] = 0x80 | (u & 0x3f);    /* 10zzzzzz */
		written = 3;
	}
	return written;
}

/* Append `len` bytes of `str` to `buf`, resolving JavaScript backslash
 * escapes on the way.
 *
 * Handled escapes: \0 \b \t \n \v \f \r, \xHH and \uHHHH (the latter is
 * written as UTF-8); for any other escaped character the backslash is
 * dropped. A NUL is never written: it is replaced by the byte 0x01.
 * If the buffer cannot be grown while writing a \u escape, the rest of the
 * input is dropped.
 *
 * NOTE(review): the hex digits of \x and \u are not validated; a non-hex
 * character makes cli_hex2int() return -1, which is then shifted. */
static void textbuffer_append_normalize(struct text_buffer *buf, const char *str, size_t len)
{
	size_t i;
	for(i=0;i < len;i++) {
		char c = str[i];
		if (c == '\\' && i+1 < len) {
			i++;
			switch (str[i]) {
				case '0':
					c = 0;
					break;
				case 'b':
					c = 8;
					break;
				case 't':
					c = 9;
					break;
				case 'n':
					c = 10;
					break;
				case 'v':
					c = 11;
					break;
				case 'f':
					c = 12;
					break;
				case 'r':
					c=13;
					break;
				case 'x':
					if(i+2 < len)
						c = (cli_hex2int(str[i+1])<<4)|cli_hex2int(str[i+2]);
					i += 2;
					break;
				case 'u':
					if(i+4 < len) {
						uint16_t u = (cli_hex2int(str[i+1])<<12) | (cli_hex2int(str[i+2])<<8) |
							(cli_hex2int(str[i+3])<<4) | cli_hex2int(str[i+4]);
						if(textbuffer_ensure_capacity(buf, 4) < 0)
							return;
						/* write at the current end of the buffer,
						 * not over its first bytes */
						buf->pos += output_utf8(u, (unsigned char*)&buf->data[buf->pos]);
						i += 4;
						continue;
					}
					break;
				default:
					c = str[i];
					break;
			}
		}
		if(!c) c = 1; /* we don't insert \0 */
		textbuffer_putc(buf, c);
	}
}


/* Decode a JavaScript escape()-style string.
 *
 * %uHHHH is replaced by the UTF-8 encoding of the 16-bit value and %HH by
 * the byte with that value; everything else, including malformed escapes, is
 * copied unchanged. A NUL is never produced: it is replaced by 0x01.
 *
 * str: NUL-terminated input, must not be NULL
 *
 * Returns a newly allocated NUL-terminated string that the caller must
 * free(), or NULL if memory could not be allocated. */
static char *cli_unescape(const char *str)
{
	char *R, *shrunk;
	size_t k, i=0;
	const size_t len = strlen(str);
	/* unescaped string is at most as long as original,
	 * it will usually be shorter */
	R = cli_malloc(len + 1);
	if(!R)
		return NULL;
	for(k=0;k < len;k++) {
		unsigned char c = str[k];
		if (str[k] == '%') {
			/* the casts keep <ctype.h> away from negative char values */
			if(k+5 >= len || str[k+1] != 'u' || !isxdigit((unsigned char)str[k+2]) || !isxdigit((unsigned char)str[k+3])
						|| !isxdigit((unsigned char)str[k+4]) || !isxdigit((unsigned char)str[k+5])) {
				if(k+2 < len && isxdigit((unsigned char)str[k+1]) && isxdigit((unsigned char)str[k+2])) {
					c = (cli_hex2int(str[k+1])<<4) | cli_hex2int(str[k+2]);
					k += 2;
				}
			} else {
				uint16_t u = (cli_hex2int(str[k+2])<<12) | (cli_hex2int(str[k+3])<<8) |
					(cli_hex2int(str[k+4])<<4) | cli_hex2int(str[k+5]);
				i += output_utf8(u, (unsigned char*)&R[i]);
				k += 5;
				continue;
			}
		}
		if(!c) c = 1; /* don't add \0 */
		/* TODO: if c >= 0x80 output UTF-8, and do the same in
		 * normalize_string, and interpret the full %u sequence ! */
		R[i++] = c;
	}
	R[i++] = '\0';
	/* give back the unused tail; if that fails the larger block is still
	 * valid (as textbuffer_done() also relies on), so return it instead of
	 * leaking it */
	shrunk = cli_realloc(R, i);
	return shrunk ? shrunk : R;
}

/* ------------ end of str.c ----------------- */

static int handle_unescape(struct tokens *tokens, size_t start, const size_t cnt)
{
	if(tokens->data[start].type == TOK_StringLiteral) {
		char *R;
		struct tokens new_tokens;
		yystype tok;

		R = cli_unescape(TOKEN_GET(&tokens->data[start], cstring));
		tok.type = TOK_StringLiteral;
		TOKEN_SET(&tok, string, R);
		new_tokens.capacity = new_tokens.cnt = 1;
		new_tokens.data = &tok;
		if(replace_token_range(tokens, start-2, start+2, &new_tokens) < 0)
			return CL_EMEM;
	}
	return CL_SUCCESS;
}


/* scriptasylum dot com's JS encoder */
/* Decode the argument of a  dF ( "literal" )  call.
 *
 * tokens/cnt: the complete token array and its length
 * start:      index of the argument token, i.e. two past the `dF` identifier
 * res:        on success txtbuf holds the decoded text (heap allocated),
 *             [pos_begin, pos_end) covers the four tokens of the call and
 *             append is set; otherwise res is left untouched
 *
 * The last character of the literal is a digit giving a shift. The rest of
 * the literal is unescaped, every byte is decreased by the shift, and the
 * result is unescaped once more. The literal's string is modified in place
 * (its last character is cut off). */
static void handle_df(const yystype *tokens, size_t start, const size_t cnt, struct decode_result *res)
{
	char *str, *s1;
	size_t len, s1_len, i;
	unsigned char clast;
	char *R;

	if(start < 2 || tokens[start].type != TOK_StringLiteral)
		return;
	str = TOKEN_GET(&tokens[start], string);
	if(!str)
		return;
	len = strlen(str);
	if(!len)
		return; /* no shift digit: str[len-1] would be out of bounds */
	clast = str[len-1] - '0';

	str[len-1] = '\0';
	s1 = cli_unescape(str);
	if(!s1)
		return;
	s1_len = strlen(s1);
	for(i=0;i<s1_len;i++) {
		s1[i] -= clast;
	}
	R = cli_unescape(s1);
	free(s1);
	if(!R)
		return;
	res->pos_begin = start-2;
	res->pos_end = start+2;
	res->txtbuf.data = R;
	res->txtbuf.pos = strlen(R);
	res->append = 1;
}


static void handle_eval(struct tokens *tokens, size_t start, struct decode_result *res)
{
	res->txtbuf.data = TOKEN_GET(&tokens->data[start], string);
	if(res->txtbuf.data && tokens->data[start+1].type == TOK_PAR_CLOSE) {
		TOKEN_SET(&tokens->data[start], string, NULL);
		res->txtbuf.pos = strlen(res->txtbuf.data);
		res->pos_begin = start-2;
		res->pos_end = start+2;
	}
}

static void run_folders(struct tokens *tokens)
{
  size_t i;

  for(i = 0; i < tokens->cnt; i++) {
	  const char *cstring = TOKEN_GET(&tokens->data[i], cstring);
	  if(i+2 < tokens->cnt && tokens->data[i].type == TOK_IDENTIFIER_NAME &&
		    cstring &&
		    !strcmp("unescape", cstring) && tokens->data[i+1].type == TOK_PAR_OPEN) {

		  handle_unescape(tokens, i+2, tokens->cnt);
	  }
  }
}

static inline int state_update_scope(struct parser_state *state, const yystype *token)
{
	if(token->type == TOK_FUNCTION) {
		struct scope *scope = TOKEN_GET(token, scope);
		if(scope) {
			state->current = scope;
		}
		else {
			/* dummy token marking function end */
			if(state->current->parent)
				state->current = state->current->parent;
			/* don't output this token, it is just a dummy marker */
			return 0;
		}
	}
	return 1;
}

static void run_decoders(struct parser_state *state)
{
  size_t i;
  const char* name;
  struct tokens *tokens = &state->tokens;

  for(i = 0; i < tokens->cnt; i++) {
	  const char *cstring = TOKEN_GET(&tokens->data[i], cstring);
	  struct decode_result res;
	  res.pos_begin = res.pos_end = 0;
	  res.append = 0;
	  if(tokens->data[i].type == TOK_FUNCTION && i+13 < tokens->cnt) {
		  name = NULL;
		  ++i;
		  if(tokens->data[i].type == TOK_IDENTIFIER_NAME) {
			  name = cstring;
			  ++i;
		  }
		  if(match_parameters(&tokens->data[i], de_packer_3, sizeof(de_packer_3)/sizeof(de_packer_3[0])) != -1
		     || match_parameters(&tokens->data[i], de_packer_2, sizeof(de_packer_2)/sizeof(de_packer_2[0])) != -1)  {
			  /* find function decl. end */
			  handle_de(tokens->data, i, tokens->cnt, name, &res);
		  }
	  } else if(i+2 < tokens->cnt && tokens->data[i].type == TOK_IDENTIFIER_NAME &&
		    cstring &&
		    !strcmp("dF", cstring) && tokens->data[i+1].type == TOK_PAR_OPEN) {
		  /* TODO: also match signature of dF function (possibly
		   * declared using unescape */

		  handle_df(tokens->data, i+2, tokens->cnt, &res);
	  } else if(i+2 < tokens->cnt && tokens->data[i].type == TOK_IDENTIFIER_NAME &&
			  cstring &&
			  !strcmp("eval", cstring) && tokens->data[i+1].type == TOK_PAR_OPEN) {
		  handle_eval(tokens, i+2, &res);
	  }
	if(res.pos_end > res.pos_begin) {
		struct tokens parent_tokens;
		if(res.pos_end < tokens->cnt && tokens->data[res.pos_end].type == TOK_SEMICOLON)
			res.pos_end++;
		parent_tokens = state->tokens;/* save current tokens */
		/* initialize embedded context */
		memset(&state->tokens, 0, sizeof(state->tokens));
		cli_js_process_buffer(state, res.txtbuf.data, res.txtbuf.pos);
		free(res.txtbuf.data);
		/* state->tokens still refers to the embedded/nested context
		 * here */
		if(!res.append) {
			replace_token_range(&parent_tokens, res.pos_begin, res.pos_end, &state->tokens);
		} else {
			/* delete tokens */
			replace_token_range(&parent_tokens, res.pos_begin, res.pos_end, NULL);
			append_tokens(&parent_tokens, &state->tokens);
		}
		/* end of embedded context, restore tokens state */
		free(state->tokens.data);
		state->tokens = parent_tokens;
	}
	  state_update_scope(state, &state->tokens.data[i]);
  }
}

/* Finish parsing: run the folding and decoding passes over the collected
 * tokens and shut down the tokenizer.
 * After this call the state can no longer be passed to
 * cli_js_process_buffer(); it can still be printed and destroyed. */
void cli_js_parse_done(struct parser_state* state)
{
	struct tokens *const collected = &state->tokens;

	run_folders(collected);
	run_decoders(state);

	yylex_destroy(state->scanner);
	/* a NULL global scope marks the state as invalid for parsing */
	state->global = NULL;
}


void cli_js_output(struct parser_state *state)
{
	unsigned i;
	struct buf buf;
	char lastchar = '\0';
	buf.pos = 0;
	buf.outfd = STDOUT_FILENO;
	state->current = state->global;
	for(i = 0; i < state->tokens.cnt; i++) {
		if(state_update_scope(state, &state->tokens.data[i]))
			lastchar = output_token(&state->tokens.data[i], state->current, &buf, lastchar);
	}
	if(write(buf.outfd, buf.buf, buf.pos) < 0) {
		cli_dbgmsg(MODULE "I/O error");
	}
}

/* Release everything owned by the parser state: all scopes, the strings
 * owned by the tokens, and the token array itself. */
void cli_js_destroy(struct parser_state *state)
{
	size_t remaining;

	scope_free_all(state->list);

	/* tokens are independent of each other, so the order does not matter */
	remaining = state->tokens.cnt;
	while(remaining--)
		free_token(&state->tokens.data[remaining]);
	free(state->tokens.data);
}

/* buffer is html-normlike "chunk", if original file is bigger than buffer,
 * we rewind to a space, so we'll know that tokens won't be broken in half at
 * the end of a buffer. All tokens except string-literals of course.
 * So we can assume that after the buffer there is either a space, EOF, or a
 * chunk of text not containing whitespace at all (for which we care only if its
 * a stringliteral)*/
/* Tokenize one chunk of JavaScript source and append its tokens to
 * state->tokens, tracking scopes and declarations on the way.
 *
 * state: parser state set up by cli_js_init(); a state that was not
 *        initialized, or on which cli_js_parse_done() was already called, is
 *        rejected with a warning
 * buf/n: the chunk and its length in bytes
 *
 * While tokenizing:
 *  - identifiers are registered in the current scope as declarations (after
 *    `var`, as function name, inside a parameter list) or as uses;
 *    identifiers following a '.' are member names and are kept as they are
 *  - `function` opens a new scope; the '}' that brings the block depth of a
 *    nested scope back to 0 closes it and is followed by a dummy
 *    TOK_FUNCTION token without a scope
 *  - "a" + "b" is folded into a single string literal
 *
 * If a function scope cannot be allocated, processing of the chunk stops.
 *
 * NOTE(review): the fallback for tokens without a value calls abort();
 * whether the tokenizer can actually produce such tokens is not visible
 * here. */
void cli_js_process_buffer(struct parser_state *state, const char *buf, size_t n)
{
	struct scope* current = state->current;
	YYSTYPE val;
	int yv;
	YY_BUFFER_STATE yyb;

	if(!state->global) {
		/* this state has either not been initialized,
		 * or cli_js_parse_done() was already called on it */
		cli_warnmsg(MODULE "invalid state");
		return;
	}
	yyb = yy_scan_bytes(buf, n, state->scanner);
	memset(&val, 0, sizeof(val));
	val.vtype = vtype_undefined;
	/* on EOF yylex will return 0 */
	while( (yv=yylex(&val, state->scanner)) != 0)
	{
		const char *text;
		size_t leng;

		val.type = yv;
		switch(yv) {
			case TOK_VAR:
				current->fsm_state = InsideVar;
				break;
			case TOK_IDENTIFIER_NAME:
				text = yyget_text(state->scanner);
				leng = yyget_leng(state->scanner);
				if(current->last_token == TOK_DOT) {
					/* this is a member name, don't normalize
					*/
					TOKEN_SET(&val, string, cli_strdup(text));
					val.type = TOK_UNNORM_IDENTIFIER;
				} else {
					switch(current->fsm_state) {
						case WaitParameterList:
							state->syntax_errors++;
							/* fall through */
						case Base:
						case InsideInitializer:
							TOKEN_SET(&val, cstring, scope_use(current, text, leng));
							break;
						case InsideVar:
						case InsideFunctionDecl:
							TOKEN_SET(&val, cstring, scope_declare(current, text, leng, state));
							current->fsm_state = InsideInitializer;
							current->brackets = 0;
							break;
						case WaitFunctionName:
							TOKEN_SET(&val, cstring, scope_declare(current, text, leng, state));
							current->fsm_state = WaitParameterList;
							break;
					}
				}
				break;
			case TOK_PAR_OPEN:
				switch(current->fsm_state) {
					case WaitFunctionName:
						/* TODO: function name is null */
						/* fallthrough */
					case WaitParameterList:
						current->fsm_state = InsideFunctionDecl;
						break;
					default:
						/* noop */
						break;
				}
				break;
			case TOK_PAR_CLOSE:
				switch(current->fsm_state) {
					case WaitFunctionName:
						state->syntax_errors++;
						break;
					case WaitParameterList:
						current->fsm_state = Base;
						break;
					default:
						/* noop */
						break;
				}
				break;
			case TOK_CURLY_BRACE_OPEN:
				switch(current->fsm_state) {
					case WaitFunctionName:
						/* TODO: function name is null */
						/* fallthrough */
					case WaitParameterList:
					case InsideFunctionDecl:
						/* in a syntactically correct
						 * file, we would already be in
						 * the Base state when we see a {
						 */
						current->fsm_state = Base;
						/* fall-through */
					case InsideVar:
					case InsideInitializer:
						state->syntax_errors++;
						/* fall-through */
					case Base:
					default:
						current->blocks++;
						break;
				}
				break;
			case TOK_CURLY_BRACE_CLOSE:
				if(current->blocks > 0)
					current->blocks--;
				else
					state->syntax_errors++;
				if(!current->blocks) {
					if(current->parent) {
						/* add dummy FUNCTION token to
						 * mark function end */
						TOKEN_SET(&val, cstring, "}");
						add_token(state, &val);
						TOKEN_SET(&val, scope, NULL);
						val.type = TOK_FUNCTION;

						state->current = current = current->parent;
					} else{
						/* extra } */
						state->syntax_errors++;
					}
				}
				break;
			case TOK_BRACKET_OPEN:
				current->brackets++;
				break;
			case TOK_BRACKET_CLOSE:
				if(current->brackets > 0)
					current->brackets--;
				else
					state->syntax_errors++;
				break;
			case TOK_COMMA:
				if (current->fsm_state == InsideInitializer && current->brackets == 0 && current->blocks == 0) {
					/* initializer ended only if we
					 * encountered a comma, and [] are
					 * balanced.
					 * This avoids switching state on:
					 * var x = [4,y,u];*/
					current->fsm_state = InsideVar;
				}
				break;
			case TOK_SEMICOLON:
				if (current->brackets == 0 && current->blocks == 0) {
					/* avoid switching state on unbalanced []:
					 * var x = [test;testi]; */
					current->fsm_state = Base;
				}
				break;
			case TOK_FUNCTION:
				{
					struct scope *fn_scope = scope_new(state);
					if(!fn_scope) {
						/* without a scope the rest of the
						 * input cannot be tracked */
						cli_warnmsg(MODULE "cannot allocate function scope\n");
						free_token(&val);
						yy_delete_buffer(yyb, state->scanner);
						return;
					}
					current = fn_scope;
				}
				current->fsm_state = WaitFunctionName;
				TOKEN_SET(&val, scope, state->current);
				break;
			case TOK_StringLiteral:
				/* folding needs two earlier tokens: "literal" and + */
				if(state->tokens.cnt >= 2 &&
				   state->tokens.data[state->tokens.cnt-1].type == TOK_PLUS) {
					/* see if can fold */
					yystype *prev_string = &state->tokens.data[state->tokens.cnt-2];
					char *str = NULL;

					if(prev_string->type == TOK_StringLiteral)
						str = TOKEN_GET(prev_string, string);
					if(str) {
						const size_t str_len = strlen(str);
						char *joined;

						text = yyget_text(state->scanner);
						leng = yyget_leng(state->scanner);

						/* grow first: if this fails nothing has
						 * been modified yet and the literal is
						 * simply added as a normal token */
						joined = cli_realloc(str, str_len + leng + 1);
						if(joined) {
							/* delete TOK_PLUS */
							free_token(&state->tokens.data[--state->tokens.cnt]);

							strncpy(joined+str_len, text+1, leng);
							joined[str_len + leng] = '\0';
							TOKEN_SET(prev_string, string, joined);
							free(val.val.string);
							memset(&val, 0, sizeof(val));
							val.vtype = vtype_undefined;
							continue;
						}
					}
				}
				break;
		}
		if(val.vtype == vtype_undefined) {
			text = yyget_text(state->scanner);
			/* TODO: tokenizer should set it to point to a constant
			 * string, it currently doesn't do that for operators ,;:=... */
			TOKEN_SET(&val, string, cli_strdup(text));
			abort();
		}
		add_token(state, &val);
		current->last_token = yv;
		memset(&val, 0, sizeof(val));
		val.vtype = vtype_undefined;
	}
	yy_delete_buffer(yyb, state->scanner);
}

/*
 * Prepare a parser_state for use: zero it, create the first scope (which
 * also becomes the global scope) and allocate the tokenizer.
 *
 * Returns CL_SUCCESS on success, CL_ENULLARG when state is NULL and
 * CL_EMEM when either the scope or the scanner cannot be allocated.
 */
int cli_js_init(struct parser_state *state)
{
	if (state == NULL) {
		return CL_ENULLARG;
	}

	memset(state, 0, sizeof *state);

	if (scope_new(state) == NULL) {
		return CL_EMEM;
	}
	/* the scope just created is the outermost one */
	state->global = state->current;

	if (yylex_init(&state->scanner) != 0) {
		/* undo the scope allocation before reporting the failure */
		scope_done(state->global);
		return CL_EMEM;
	}

	yyset_debug(1, state->scanner);
	return CL_SUCCESS;
}

int main(int argc,char** argv)
{
	int n;
	char buf[BUFS+2];
	struct parser_state state;

	/*cli_debug_flag=1;*/
	printf("Enter javascript:\n");
	printf("  Terminate with ^D\n");

	cli_js_init(&state);
	while ( ( n=read(fileno(stdin), buf, BUFS )) >  0)
	{
		/*buf[n] = '\0';*/
		cli_js_process_buffer(&state, buf, n);
	}
	cli_js_parse_done(&state);
	cli_js_output(&state);
	cli_js_destroy(&state);
	return 0;
}

/* TODO: special identifiers in global scope (document, ...) 
 *
 * avoid extra strdup: 
 *  - when string is going to be folded
 *  - normalize_string, and hashtab_insert - avoid one
 *
 * decoded stuff should be parsed in the correct context (not a global one!)
 *
 * make unescape, packers handling more generic
 * memory leaks - manual check
 * check for allocation failure everywhere
 * limits
 * security check
 * */


/*-------------- tokenizer ---------------------*/
/*
 * Classification of a single input byte, as stored in the ctype[] and
 * id_ctype[] lookup tables.
 *
 * The first group (Whitespace .. IdStart) are tokenizer-internal classes.
 * The punctuation classes are given the value of the matching TOK_* token,
 * which lets the tokenizer return the class itself as the token type (see
 * CASE_SPECIAL_CHAR). Nop marks bytes that carry no meaning for the
 * tokenizer.
 */
enum char_class {
	Whitespace,
	Slash,
	Operator,
	DQuote,
	SQuote,
	Digit,
	IdStart,
	BracketOpen = TOK_BRACKET_OPEN,
	BracketClose = TOK_BRACKET_CLOSE,
	Comma = TOK_COMMA,
	CurlyOpen = TOK_CURLY_BRACE_OPEN,
	CurlyClose = TOK_CURLY_BRACE_CLOSE,
	ParOpen = TOK_PAR_OPEN,
	ParClose = TOK_PAR_CLOSE,
	Dot = TOK_DOT,
	SemiColon = TOK_SEMICOLON,
	Nop
};

/* Two-letter aliases for the char_class values, used only to keep the
 * 256-entry lookup tables below readable (16 entries per row). */
#define SL Slash
#define DG Digit
#define DQ DQuote
#define SQ SQuote
#define ID IdStart
#define OP Operator
#define WS Whitespace
#define BO BracketOpen
#define BC BracketClose
#define CM Comma
#define CO CurlyOpen
#define CC CurlyClose
#define PO ParOpen
#define PC ParClose
#define DT Dot
#define SC SemiColon
#define NA Nop

/*
 * Class of every possible input byte when the tokenizer is between tokens
 * (indexed by the unsigned byte value, one row per 16 values).
 *
 * Tab, LF, VT, CR and space are whitespace; letters, '$', '_' and '\' can
 * start an identifier; '#', '@', '`', DEL, the remaining control characters
 * (including form feed) and every byte >= 0x80 are Nop.
 */
static const enum char_class ctype[256] = {
	NA, NA, NA, NA, NA, NA, NA, NA, NA, WS, WS, WS, NA, WS, NA, NA,
	NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
	WS, OP, DQ, NA, ID, OP, OP, SQ, PO, PC, OP, OP, CM, OP, DT, SL,
	DG, DG, DG, DG, DG, DG, DG, DG, DG, DG, OP, SC, OP, OP, OP, OP,
	NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID,
	ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, BO, ID, BC, OP, ID,
	NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID,
	ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, CO, OP, CC, OP, NA,
	NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
	NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
	NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
	NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
	NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
	NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
	NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
	NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
};

/*
 * Class of every possible input byte while the tokenizer is inside an
 * identifier (indexed by the unsigned byte value, one row per 16 values).
 *
 * Letters, digits, '$' and '_' are ID and extend the identifier. The
 * backslash is the only byte marked OP, so that the identifier scanner can
 * single it out. Every other byte is NA.
 */
static const enum char_class id_ctype[256] = {
	NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
        NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
        NA, NA, NA, NA, ID, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
        ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, NA, NA, NA, NA, NA, NA,
        NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID,
        ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, NA, OP, NA, NA, ID,
        NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID,
        ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, NA, NA, NA, NA, NA,
        NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
        NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
        NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
        NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
        NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
        NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
        NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
        NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
};

/*
 * Expands to a complete `case` arm for a single-character punctuation token:
 * stores the constant spelling S as the token's cstring value and returns
 * the character class as the token type. Must be used inside a switch in a
 * function that has `lvalp` and `cClass` in scope.
 */
#define CASE_SPECIAL_CHAR(C, S) case C: TOKEN_SET(lvalp, cstring, (S)); return cClass;

/* Largest scratch-buffer capacity that is kept around between tokens. */
#define BUF_KEEP_SIZE 32768

/*
 * Reset the scratch buffer so the next token starts at offset 0.
 *
 * If an earlier token grew the buffer beyond BUF_KEEP_SIZE, try to shrink it
 * back. A failed shrink is harmless: the old, larger allocation is still
 * valid, so it is kept together with its real capacity instead of replacing
 * the pointer with NULL (which would leak the block and leave a non-zero
 * capacity describing no memory at all).
 */
static void textbuf_clean(struct text_buffer *buf)
{
	if(buf->capacity > BUF_KEEP_SIZE) {
		char *shrunk = cli_realloc(buf->data, BUF_KEEP_SIZE);
		if(shrunk) {
			buf->data = shrunk;
			buf->capacity = BUF_KEEP_SIZE;
		}
	}
	buf->pos = 0;
}


/*
 * Scan the body of a quoted string literal starting at the current input
 * position (the opening quote has already been consumed, or the string is
 * being resumed from a previous buffer).
 *
 * lvalp   - receives the string value when the literal is complete
 * scanner - tokenizer state; pos is advanced past everything consumed
 * q       - the quote character that terminates the literal
 * tostate - tokenizer state to enter when the closing quote is not in this
 *           buffer, so that the next call resumes the same literal
 *
 * Returns TOK_StringLiteral when the closing quote was found, 0 when the
 * input ran out first (the text seen so far stays in the scratch buffer).
 */
static inline int parseString(YYSTYPE *lvalp, yyscan_t scanner, const char q,
		enum tokenizer_state tostate)
{
	size_t len;
	const char *start = &scanner->in[scanner->pos], *end = start;
	const char *const limit = &scanner->in[scanner->insize];

	/* look for the unescaped quote terminating the string */
	for(;;) {
		end = memchr(end, q, (size_t)(limit - end));
		/* Only look at the preceding byte when it belongs to this
		 * string: for end == start it is either the opening quote or,
		 * when resuming at offset 0, memory before the buffer.
		 * NOTE(review): a quote preceded by an escaped backslash
		 * (\\") is still treated as escaped here. */
		if(end && end > start && end[-1] == '\\') {
			++end;
			continue;
		}
		break;
	}

	/* An empty literal has end == start and a length of 0; only a missing
	 * terminator means "take the rest of the buffer". */
	len = end ? (size_t)(end - start) : scanner->insize - scanner->pos;
	textbuffer_append_normalize(&scanner->buf, start, len);
	if(end) {
		/* skip over end quote */
		scanner->pos += len + 1;
		textbuffer_putc(&scanner->buf, '\0');
		TOKEN_SET(lvalp, string, textbuffer_done(scanner));
		scanner->state = Initial;
		assert(lvalp->val.string);
		return TOK_StringLiteral;
	} else {
		scanner->pos += len;
		/* unfinished string */
		scanner->state = tostate;
		return 0;
	}
}

/* Scan a double-quoted string literal; resumes in DoubleQString state when
 * the closing quote is not in the current buffer. */
static inline int parseDQString(YYSTYPE *lvalp, yyscan_t scanner)
{
	const char quote = '"';

	return parseString(lvalp, scanner, quote, DoubleQString);
}

/* Scan a single-quoted string literal; resumes in SingleQString state when
 * the closing quote is not in the current buffer. */
static inline int parseSQString(YYSTYPE *lvalp, yyscan_t scanner)
{
	const char quote = '\'';

	return parseString(lvalp, scanner, quote, SingleQString);
}

/*
 * Scan a numeric literal (digits, at most one '.', and an exponent once a
 * '.' has been seen), accumulating its text in the scratch buffer.
 *
 * Returns TOK_NumericInt or TOK_NumericFloat with the converted value stored
 * in lvalp once a byte that cannot belong to the number is found (that byte
 * is left unread). Returns 0 with the tokenizer in Number state when the
 * input ends first, so the next buffer continues the same literal.
 */
static inline int parseNumber(YYSTYPE *lvalp, yyscan_t scanner)
{
	const unsigned char *in = (const unsigned char*)scanner->in;
	/* When resuming in Number state the scratch buffer already holds the
	 * first part of the literal; recover from it whether the decimal point
	 * was seen, otherwise "1." + "5" would be converted as an integer.
	 * On a fresh token the buffer is empty and this evaluates to 0. */
	int is_float = scanner->buf.data != NULL && scanner->buf.pos > 0 &&
		memchr(scanner->buf.data, '.', scanner->buf.pos) != NULL;

	while(scanner->pos < scanner->insize) {
		unsigned char c = in[scanner->pos++];
		if(isdigit(c)) {
			textbuffer_putc(&scanner->buf, c);
			continue;
		}
		if(c =='.' && !is_float) {
			is_float = 1;
			textbuffer_putc(&scanner->buf, '.');
			continue;
		}
		if((c=='e' || c=='E') && is_float) {
			textbuffer_putc(&scanner->buf, c);
			if(scanner->pos < scanner->insize) {
				c = in[scanner->pos++];
				if(c == '+' || c == '-' || isdigit(c)) {
					textbuffer_putc(&scanner->buf, c);
					continue;
				}
			}
		}
		/* c does not belong to the number: un-read it and finish */
		scanner->pos--;
		textbuffer_putc(&scanner->buf, '\0');
		scanner->state = Initial;
		/* presumably the buffer can still be unallocated if growing it
		 * failed; never hand NULL to atof()/atoi() */
		if(!scanner->buf.data)
			return 0;
		if(is_float) {
			TOKEN_SET(lvalp, dval, atof(scanner->buf.data));
			return TOK_NumericFloat;
		} else {
			TOKEN_SET(lvalp, ival, atoi(scanner->buf.data));
			return TOK_NumericInt;
		}
	}
	/* input exhausted in the middle of the literal */
	scanner->state = Number;
	return 0;
}

/*
 * Terminate the identifier collected in the scratch buffer and classify it.
 * Returns the keyword's token value (with the keyword's constant name as
 * cstring) when the text is a keyword, TOK_IDENTIFIER_NAME otherwise.
 */
static inline int parseId_finish(YYSTYPE *lvalp, yyscan_t scanner)
{
	const struct keyword *kw;

	textbuffer_putc(&scanner->buf, '\0');
	/* length excludes the terminator just appended */
	kw = in_word_set(scanner->buf.data, scanner->buf.pos-1);
	if(kw) {
		/* we got a keyword */
		TOKEN_SET(lvalp, cstring, kw->name);
		return kw->val;
	}
	/* it is not a keyword, just an identifier */
	TOKEN_SET(lvalp, cstring, NULL);
	return TOK_IDENTIFIER_NAME;
}

/*
 * Scan an identifier or keyword, accumulating its text in the scratch
 * buffer.
 *
 * Returns the token type once a byte that cannot continue the identifier is
 * found (that byte is left unread). Returns 0 with the tokenizer in
 * Identifier state when the input ends first, so the next buffer continues
 * the same identifier.
 */
static inline int parseId(YYSTYPE *lvalp, yyscan_t scanner)
{
	const unsigned char *in = (const unsigned char*)scanner->in;
	scanner->state = Initial;
	while(scanner->pos < scanner->insize) {
		unsigned char c = in[scanner->pos++];
		enum char_class cClass = id_ctype[c];
		switch(cClass) {
			case IdStart:
				textbuffer_putc(&scanner->buf, c);
				break;
			case Operator:
				/* the table contains OP only for \ */
				assert(c == '\\');
				if(scanner->pos < scanner->insize &&
						in[scanner->pos] == 'u') {
					/* \u escape: consume the 'u', keep the backslash */
					scanner->pos++;
					textbuffer_putc(&scanner->buf, c);
					break;
				}
				/* Stray backslash: it ends the identifier and is
				 * dropped. It must stay consumed even when it is the
				 * last byte of the buffer; un-reading it there would
				 * make the tokenizer return empty identifiers forever,
				 * because a backslash also starts an identifier. */
				return parseId_finish(lvalp, scanner);
			default:
				/* character is no longer part of identifier */
				scanner->pos--;
				return parseId_finish(lvalp, scanner);
		}
	}
	scanner->state = Identifier;
	return 0;
}

#ifndef MIN
#define MIN(a,b) ((a)<(b) ? (a):(b))
#endif

/*
 * Recognise the longest operator (at most 5 bytes) starting at the current
 * input position.
 *
 * On a match the operator's constant name is stored in lvalp, the input
 * position moves past it and its token value is returned. When nothing
 * matches, a single byte is skipped and TOK_ERROR is returned.
 */
static int parseOperator(YYSTYPE *lvalp, yyscan_t scanner)
{
	size_t try_len;

	/* longest candidate first, shrinking one byte at a time */
	for(try_len = MIN(5, scanner->insize - scanner->pos); try_len != 0; try_len--) {
		const struct operator *op = in_op_set(&scanner->in[scanner->pos], try_len);
		if(!op)
			continue;
		TOKEN_SET(lvalp, cstring, op->name);
		scanner->pos += try_len;
		return op->val;
	}

	/* not an operator at all: step over the offending byte */
	scanner->pos++;
	TOKEN_SET(lvalp, cstring, NULL);
	return TOK_ERROR;
}

/* Allocate a zero-initialised scanner and store it in *scanner.
 * Returns 0 on success, -1 when the allocation fails (*scanner is then
 * NULL). */
static int yylex_init(yyscan_t *scanner)
{
	yyscan_t fresh = cli_calloc(1, sizeof(*fresh));

	*scanner = fresh;
	if(fresh == NULL)
		return -1;
	return 0;
}

/* Release the scanner together with its scratch buffer. Always returns 0. */
static int yylex_destroy(yyscan_t scanner)
{
	struct text_buffer *scratch = &scanner->buf;

	free(scratch->data);
	free(scanner);
	return 0;
}

/* Point the scanner at a new input buffer of len bytes starting at p and
 * rewind the read position. The buffer is not copied. Always returns 0. */
static int yy_scan_bytes(const char *p, size_t len, yyscan_t scanner)
{
	scanner->pos = 0;
	scanner->insize = len;
	scanner->in = p;
	return 0;
}

/* Placeholder kept for interface compatibility; debugging output is not
 * implemented yet, so both arguments are ignored. */
static void yyset_debug (int debug_flag ,yyscan_t yyscanner )
{
	/* TODO */
	(void)debug_flag;
	(void)yyscanner;
}

/* Placeholder kept for interface compatibility; there is currently nothing
 * to release, so both arguments are ignored. */
static void yy_delete_buffer( YY_BUFFER_STATE yyb, yyscan_t scanner)
{
	/* TODO */
	(void)yyb;
	(void)scanner;
}

/* Text of the current token: the explicit yytext pointer when one is set,
 * otherwise the contents of the scratch buffer. */
static const char *yyget_text(yyscan_t scanner)
{
	const char *text;

	assert(scanner->buf.data || scanner->yytext);
	text = scanner->yytext;
	if(!text)
		text = scanner->buf.data;
	return text;
}

/* Length of the current token: yylen when it is non-zero, otherwise the
 * number of bytes in the scratch buffer minus its terminating \0 (0 for an
 * empty buffer). */
static int yyget_leng(yyscan_t scanner)
{
	if(scanner->yylen)
		return scanner->yylen;
	if(scanner->buf.pos > 0) {
		/* we have a \0 too */
		return scanner->buf.pos - 1;
	}
	return 0;
}

/*
 * Return the next token from the scanner's current input buffer.
 *
 * The tokenizer is resumable: when the input ends in the middle of a string,
 * number, identifier or comment, the corresponding state is kept in
 * scanner->state and the next call (on the next buffer) continues from
 * there.
 *
 * lvalp   - receives the token's value
 * scanner - tokenizer state and input buffer
 *
 * Returns the token type, or 0 when the buffer is exhausted without
 * completing a token.
 */
static int yylex(YYSTYPE *lvalp, yyscan_t  scanner)
{
	const size_t len = scanner->insize;
	const unsigned char *in = (const unsigned char*)scanner->in;
	unsigned char lookahead;
	enum char_class cClass;

	scanner->yytext = NULL;
	scanner->yylen = 0;
	while(scanner->pos < scanner->insize) {
		switch(scanner->state) {
			case Initial:
				/* a new token starts here: drop the previous text */
				textbuf_clean(&scanner->buf);
				cClass = ctype[in[scanner->pos++]];
				switch(cClass) {
					case Whitespace:
						/* eat whitespace */
						continue;
					case Slash:
						/* comment start, or the division operator */
						if(scanner->pos < len) {
							lookahead = in[scanner->pos];
							switch(lookahead) {
								case '*':
									scanner->state = MultilineComment;
									scanner->pos++;
									continue;
								case '/':
									scanner->state = SinglelineComment;
									scanner->pos++;
									continue;
							}
						}
						--scanner->pos;
						return parseOperator(lvalp, scanner);
					case Operator:
						--scanner->pos;
						return parseOperator(lvalp, scanner);
					case DQuote:
						return parseDQString(lvalp, scanner);
					case SQuote:
						return parseSQString(lvalp, scanner);
					case Digit:
						--scanner->pos;
						return parseNumber(lvalp, scanner);
					case IdStart:
						--scanner->pos;
						return parseId(lvalp,scanner);
					CASE_SPECIAL_CHAR(BracketOpen, "[");
					CASE_SPECIAL_CHAR(BracketClose, "]");
					CASE_SPECIAL_CHAR(Comma, ",");
					CASE_SPECIAL_CHAR(CurlyOpen, "{");
					CASE_SPECIAL_CHAR(CurlyClose, "}");
					CASE_SPECIAL_CHAR(ParOpen, "(");
					CASE_SPECIAL_CHAR(ParClose, ")");
					CASE_SPECIAL_CHAR(Dot, ".");
					CASE_SPECIAL_CHAR(SemiColon, ";");
					case Nop:
					       continue;
				}
				break;
			case DoubleQString:
				return parseDQString(lvalp, scanner);
			case SingleQString:
				/* must resume as a single-quoted string if it is
				 * still unfinished after this buffer */
				return parseSQString(lvalp, scanner);
			case Identifier:
				return parseId(lvalp, scanner);
			case MultilineComment:
				while(scanner->pos+1 < scanner->insize) {
					if(in[scanner->pos] == '*' && in[scanner->pos+1] == '/') {
						/* end of comment: step onto the '/' here,
						 * the increment below steps past it */
						scanner->state = Initial;
						scanner->pos++;
						break;
					}
					scanner->pos++;
				}
				/* Either skips the closing '/', or consumes the last
				 * byte of the buffer while staying in comment state so
				 * the comment continues in the next buffer.
				 * NOTE(review): a terminator split across two buffers
				 * ('*' | '/') is not recognised. */
				scanner->pos++;
				break;
			case Number:
				return parseNumber(lvalp, scanner);
			case SinglelineComment:
				while(scanner->pos < scanner->insize &&
						in[scanner->pos] != '\n')
					scanner->pos++;
				/* leave comment state only when the newline was really
				 * seen; otherwise the comment continues in the next
				 * buffer */
				if(scanner->pos < scanner->insize)
					scanner->state = Initial;
				break;
		}
	}
	return 0;
}