clamav/libclamav/regex_list.c

/*
 *  Match a string against a list of patterns/regexes.
 *
 *  Copyright (C) 2007-2008 Sourcefire, Inc.
 *
 *  Authors: Török Edvin
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 *  MA 02110-1301, USA.
 */

#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif

#ifndef CL_DEBUG
#define NDEBUG
#endif

#ifdef CL_THREAD_SAFE
#ifndef _REENTRANT
#define _REENTRANT
#endif
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <zlib.h>

#include <limits.h>
#include <sys/types.h>
#include <assert.h>


#include "regex/regex.h"


#include "clamav.h"
#include "others.h"
#include "regex_list.h"
#include "matcher-ac.h"
#include "matcher.h"
#include "str.h"
#include "readdb.h"
#include "jsparse/textbuf.h"

/* ------- parse a regular expression, and extract a static suffix ------*/
/* Kinds of nodes in the parse tree that parse_regex() builds. */
enum node_type {
	root=0, /* type of a zero-filled node; add_pattern() uses one as parent of the parsed tree */
	concat, /* left subtree followed by right subtree */
	alternate, /* | */
	optional,/* ?, * */
	leaf, /* a character */
	leaf_class /* character class */
	/* (x)+ is transformed into (x)*(x) */
};

/*
 * One node of the regex parse tree.
 * Which member of 'u' is valid depends on 'type':
 *   concat / alternate / optional -> u.children
 *   leaf_class                    -> u.leaf_class_bitmap
 *   leaf                          -> u.leaf_char
 */
struct node {
	enum node_type type;
	struct node *parent; /* NULL until the node is attached below another node */
	union {
		struct {
			struct node* left;
			struct node* right; /* NULL for 'optional' nodes created by parse_regex() */
		} children;
		uint8_t*    leaf_class_bitmap; /* 32 bytes = 256 bits, one per byte value */
		uint8_t     leaf_char;
	} u;
};

/* Prototypes of functions that are used before their definition */
static size_t reverse_string(char *pattern);
static int add_pattern(struct regex_matcher *matcher, char *pattern);
static int add_pattern_suffix(struct regex_matcher *matcher, char *suffix, size_t suffix_len, struct regex_list *regex);
static int add_static_pattern(struct regex_matcher *matcher, char* pattern);
static int build_suffixtree_descend(struct regex_matcher *matcher, struct regex_list *regex, struct node *n, struct text_buffer *buf);
/* ---------- */

/* Shared bitmap used for '.' nodes, and returned by parse_char_class() for
 * bracket expressions it does not handle. It is never heap allocated:
 * destroy_tree() compares against its address and skips the free(). */
static uint8_t dot_bitmap[32];

/*
 * Create an inner node of kind 'type' above 'left' and 'right'.
 * A concat with a missing operand needs no node: the other operand is
 * returned unchanged (which may itself be NULL).
 * Returns NULL if the allocation fails; the children are left untouched then.
 */
static struct node* make_node(enum node_type type, struct node *left, struct node *right)
{
	struct node *fresh;

	if(type == concat && left == NULL)
		return right;
	if(type == concat && right == NULL)
		return left;

	fresh = cli_malloc(sizeof(*fresh));
	if(fresh == NULL)
		return NULL;

	fresh->type = type;
	fresh->parent = NULL;
	fresh->u.children.left = left;
	fresh->u.children.right = right;

	/* hook the children up to their new parent */
	if(left != NULL)
		left->parent = fresh;
	if(right != NULL)
		right->parent = fresh;

	return fresh;
}

/* defined further down; needed here to release partially built copies */
static void destroy_tree(struct node *n);

/*
 * Deep-copy the subtree rooted at 'p'.
 * The copy's parent pointer is NULL; parent pointers inside the copy refer
 * to the copied nodes. A leaf_class bitmap is always duplicated into a
 * fresh 32-byte allocation.
 * Returns NULL if 'p' is NULL or if any allocation fails; on failure
 * everything allocated for the copy so far is released.
 */
static struct node *dup_node(struct node *p)
{
	struct node *node_left, *node_right;
	struct node *d;

	if(!p)
		return NULL;
	d = cli_malloc(sizeof(*d));
	if(!d)
		return NULL;
	d->type = p->type;
	d->parent = NULL;
	switch(p->type) {
		case leaf:
			d->u.leaf_char = p->u.leaf_char;
			break;
		case leaf_class:
			d->u.leaf_class_bitmap = cli_malloc(32);
			if(!d->u.leaf_class_bitmap) {
				/* do not leak the half-initialised node */
				free(d);
				return NULL;
			}
			memcpy(d->u.leaf_class_bitmap, p->u.leaf_class_bitmap, 32);
			break;
		default:
			node_left = dup_node(p->u.children.left);
			node_right = dup_node(p->u.children.right);
			if((p->u.children.left && !node_left) ||
			   (p->u.children.right && !node_right)) {
				/* a child could not be copied: returning a
				 * truncated tree would silently change the
				 * meaning of the regex, so fail as a whole */
				destroy_tree(node_left);
				destroy_tree(node_right);
				free(d);
				return NULL;
			}
			d->u.children.left = node_left;
			d->u.children.right = node_right;
			if(node_left)
				node_left->parent = d;
			if(node_right)
				node_right->parent = d;
			break;
	}
	return d;
}

/*
 * Wrap 'bitmap' in a new leaf_class node.
 * The pointer is stored as-is, no copy is made.
 * Returns NULL if the node cannot be allocated.
 */
static struct node *make_charclass(uint8_t *bitmap)
{
	struct node *cls = cli_malloc(sizeof(*cls));

	if(cls != NULL) {
		cls->type = leaf_class;
		cls->parent = NULL;
		cls->u.leaf_class_bitmap = bitmap;
	}
	return cls;
}

static struct node *make_leaf(char c)
{
	struct node *v = cli_malloc(sizeof(*v));
	if(!v)
		return NULL;
	v->type = leaf;
	v->parent = NULL;
	v->u.leaf_char = c;
	return v;
}

/*
 * Free the subtree rooted at 'n' (NULL is accepted).
 * Children of concat/alternate/optional nodes are freed recursively.
 * A class bitmap is freed unless it is the shared static dot_bitmap.
 */
static void destroy_tree(struct node *n)
{
	if(n == NULL)
		return;

	if(n->type == concat || n->type == alternate || n->type == optional) {
		destroy_tree(n->u.children.left);
		destroy_tree(n->u.children.right);
	} else if(n->type == leaf_class) {
		if(n->u.leaf_class_bitmap != dot_bitmap)
			free(n->u.leaf_class_bitmap);
	}
	/* root and leaf nodes own nothing besides themselves */

	free(n);
}

/*
 * Skip a nested bracket construct ("[.x.]" or "[:name:]") and the rest of
 * the enclosing bracket expression. On entry pat[*pos] is the nested '[';
 * on return *pos is at the ']' closing the enclosing expression, or at the
 * terminating NUL if the pattern ends early.
 */
static void skip_nested_bracket(const char *pat, size_t *pos)
{
	while(pat[*pos] != ']' && pat[*pos] != '\0') ++*pos;
	if(pat[*pos] == ']')
		++*pos;
	while(pat[*pos] != ']' && pat[*pos] != '\0') ++*pos;
}

/*
 * Parse the inside of a bracket expression into a 256-bit bitmap.
 *
 * @pat - the regular expression
 * @pos - in: index of the first character after the opening '[';
 *        out: index of the closing ']' (or of the terminating NUL if the
 *        expression is unterminated)
 *
 * Returns a newly allocated 32-byte bitmap, the shared dot_bitmap when the
 * expression uses a construct that is not handled (collating sequence,
 * named class), or NULL if the allocation fails.
 *
 * Bits are toggled with XOR so that the same code serves negated classes:
 * for "[^...]" the bitmap starts all ones and members are switched off.
 */
static uint8_t* parse_char_class(const char *pat, size_t *pos)
{
	unsigned char range_start=0;
	int hasprev = 0;
	uint8_t* bitmap = cli_malloc(32);
	if(!bitmap)
		return NULL;
	if (pat[*pos]=='^') {
		memset(bitmap,0xFF,32);/*match chars not in brackets*/
		++*pos;
	}
	else
		memset(bitmap,0x00,32);
	do {
		/* literal ] can be first character, so test for it at the end of the loop, for example: []] */
		if (pat[*pos]=='\0') {
			/* unterminated expression: never read past the end */
			break;
		}
		if (pat[*pos]=='-' && hasprev && pat[*pos+1]!=']' && pat[*pos+1]!='\0') {
			/* it is a range (a '-' right before ']' is a literal
			 * and is handled by the last branch) */
			unsigned char range_end;
			unsigned int c;
			assert(range_start);
			++*pos;
			if (pat[*pos]=='[' && pat[*pos+1]=='.') {
				/* collating sequence not handled */
				free(bitmap);
				/* we are parsing the regex for a
				 * filter, be conservative and
				 * tell the filter that anything could
				 * match here */
				skip_nested_bracket(pat, pos);
				return dot_bitmap;
			}
			range_end = (unsigned char)pat[*pos];
			/* range_start itself was already toggled as a literal */
			for(c=range_start+1u;c<=range_end;c++)
				bitmap[c>>3] ^= (uint8_t)(1u<<(c&0x7));
			/* step over the range end, otherwise the next iteration
			 * would toggle it back off as a literal */
			++*pos;
			hasprev = 0;
		}
		else if (pat[*pos]=='[' && pat[*pos+1]==':') {
			/* named char class, not handled: anything may match */
			free(bitmap);
			skip_nested_bracket(pat, pos);
			return dot_bitmap;
		} else {
			/* index with an unsigned value: plain char may be negative */
			const unsigned char ch = (unsigned char)pat[*pos];
			bitmap[ch>>3] ^= (uint8_t)(1u<<(ch&0x7));
			/* remember this character as a possible range start
			 * before moving on */
			range_start = ch;
			++*pos;
			hasprev = 1;
		}
	} while(pat[*pos]!=']');
	return bitmap;
}

static struct node* parse_regex(const char *p, size_t *last)
{
	struct node *v = NULL;
	struct node *right;
	struct node *tmp;

	while(p[*last] != '$' && p[*last] != '\0') {
		switch(p[*last]) {
			case '|':
				++*last;
				right = parse_regex(p, last);
				v = make_node(alternate, v, right);
				if(!v)
					return NULL;
				break;
			case '*':
			case '?':
				v = make_node(optional, v, NULL);
				if(!v)
					return NULL;
				++*last;
				break;
			case '+':
				/* (x)* */
				tmp = make_node(optional, v, NULL);
				if(!tmp)
					return NULL;
				/* (x) */
				right = dup_node(v);
				if(!right)
					return NULL;
				/* (x)*(x) => (x)+ */
				v = make_node(concat, tmp, right);
				if(!v)
					return NULL;
				++*last;
				break;
			case '(':
				++*last;
				right = parse_regex(p, last);
				if(!right)
					return NULL;
				++*last;
				v = make_node(concat, v, right);
				break;
			case ')':
				return v;
			case '.':
				right = make_charclass(dot_bitmap);
				if(!right)
					return NULL;
				v = make_node(concat, v, right);
				if(!v)
					return NULL;
				++*last;
				break;
			case '[':
				right = make_charclass( parse_char_class(p, last) );
				if(!right)
					return NULL;
				v = make_node(concat, v, right);
				if(!v)
					return NULL;
			case '\\':
				/* next char is escaped, advance pointer
				 * and let fall-through handle it */
				++*last;
			default:
				right = make_leaf(p[*last]);
				v = make_node(concat, v, right);
				if(!v)
					return NULL;
				++*last;
				break;
		}
	}
	return v;
}

#define BITMAP_HASSET(b, i) (b[i>>3] & (1<<(i&7)))

static int build_suffixtree_ascend(struct regex_matcher *matcher, struct regex_list *regex, struct node *n, struct text_buffer *buf, struct node *prev)
{
	size_t i;
	while(n) {
		struct node *q = n;
		switch(n->type) {
			case root:
				textbuffer_putc(buf, '\0');
				if(add_pattern_suffix(matcher, buf->data, buf->pos, regex) < 0)
					return CL_EMEM;
				return 0;
			case leaf:
				textbuffer_putc(buf, n->u.leaf_char);
				n = n->parent;
				break;
			case leaf_class:
				if(memcmp(n->u.leaf_class_bitmap, dot_bitmap, sizeof(dot_bitmap)) == 0) {
					textbuffer_putc(buf, '\0');
					if(add_pattern_suffix(matcher, buf->data, buf->pos, regex) < 0)
						return CL_EMEM;
					return 0;
				}
				for(i=0;i<255;i++) {
					if(BITMAP_HASSET(n->u.leaf_class_bitmap, i)) {
						size_t pos;
						pos = buf->pos;
						textbuffer_putc(buf, i);
						if(build_suffixtree_ascend(matcher, regex, n->parent, buf, n) < 0)
							return CL_EMEM;
						buf->pos = pos;
					}
				}
				return 0;
			case concat:
				if(prev != n->u.children.left) {
					if(build_suffixtree_descend(matcher, regex, n->u.children.left, buf) < 0)
						return CL_EMEM;
					/* we're done here, descend will call
					 * ascend if needed */
					return 0;
				} else {
					n = n->parent;
				}
				break;
			case alternate:
				n = n->parent;
				break;
			case optional:
				textbuffer_putc(buf, '\0');
				if(add_pattern_suffix(matcher, buf->data, buf->pos, regex) < 0)
					return CL_EMEM;
				return 0;
		}
		prev = q;
	}
	return 0;
}

/*
 * Find the end of the expression rooted at 'n' by following the right
 * operand of every concat node, then act on what is found there:
 *  - alternate: handle both branches, each starting from the current
 *    buffer position;
 *  - optional: the suffix collected so far is registered for 'regex';
 *  - leaf / leaf_class: start walking upwards from it.
 * Any other node kind (or an empty tree) is ignored.
 *
 * Returns 0 on success, CL_EMEM if a callee reported a negative result.
 */
static int build_suffixtree_descend(struct regex_matcher *matcher, struct regex_list *regex, struct node *n, struct text_buffer *buf)
{
	struct node *tail = n;
	size_t restart;

	while(tail != NULL && tail->type == concat)
		tail = tail->u.children.right;
	if(tail == NULL)
		return 0;

	if(tail->type == alternate) {
		/* both branches start from the same buffer position */
		restart = buf->pos;
		if(build_suffixtree_descend(matcher, regex, tail->u.children.left, buf) < 0)
			return CL_EMEM;
		buf->pos = restart;
		if(build_suffixtree_descend(matcher, regex, tail->u.children.right, buf) < 0)
			return CL_EMEM;
		buf->pos = restart;
		return 0;
	}

	if(tail->type == optional) {
		textbuffer_putc(buf, '\0');
		if(add_pattern_suffix(matcher, buf->data, buf->pos, regex) < 0)
			return CL_EMEM;
		return 0;
	}

	if(tail->type == leaf || tail->type == leaf_class) {
		if(build_suffixtree_ascend(matcher, regex, tail, buf, NULL) < 0)
			return CL_EMEM;
		return 0;
	}

	return 0;
}


/* ----- shift-or filtering -------------- */

/* Bit set stored in an array of 32-bit words: bit 'val' lives in word
 * val/32 at position val%32. The mask is built from an unsigned 1 so that
 * position 31 does not shift into the sign bit of an int. */
#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & (1u << ((val) & 0x1f)))
#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= (1u << ((val) & 0x1f)))

/*
 * Put the filter into its empty state: every bit of the B, end and
 * end_fast tables is set to one.
 */
static void SO_init(struct filter *m)
{
	memset(m->end_fast, 0xFF, sizeof(m->end_fast));
	memset(m->end, 0xFF, sizeof(m->end));
	memset(m->B, 0xFF, sizeof(m->B));
}

/* because we use uint32_t */
#define MAXSOPATLEN 32

/* merge another pattern into the filter
 * add('abc'); add('bcd'); will match [ab][bc][cd]
 *
 * @m       - filter to update
 * @pattern - bytes of the pattern
 * @len     - number of bytes; cut to MAXSOPATLEN, otherwise rounded down to
 *            an even count
 *
 * A cleared bit j in B[q] means "2-gram q is allowed at position j".
 * A cleared bit j in end[q] / end_fast[c] means "a pattern may end at
 * position j with 2-gram q / with a 2-gram whose second byte is c".
 * SO_search() looks up end_fast by the second byte of the current 2-gram,
 * so the same byte has to be used here.
 *
 * Always returns 0. */
static int SO_preprocess_add(struct filter *m, const unsigned char *pattern, size_t len)
{
	uint16_t q = 0;
	uint8_t j;

	/* cut length, and make it modulo 2 */
	if(len > MAXSOPATLEN) {
		len = MAXSOPATLEN;
	} else {
		/* we use 2-grams, must be multiple of 2 */
		len = len & ~(size_t)1;
	}
	if(!len)
		return 0;

	/* Shift-Or like preprocessing */
	for(j=0;j < len-1;j++) {
		/* use overlapping 2-grams. We need them overlapping because matching can start at any position */
		q = cli_readint16( &pattern[j] );
		m->B[q] &= ~((uint32_t)1 << j);
	}
	/* we use variable length patterns, use last character to mark pattern end,
	 * can lead to false positives.*/
	/* mark that at state j, the q-gram q can end the pattern */
	if(j) {
		j--;
		m->end[q] &= ~((uint32_t)1 << j);
		/* clear (not keep) bit j, and index by the second byte of the
		 * last 2-gram; otherwise SO_search() rejects real matches */
		m->end_fast[pattern[j+1]] &= ~((uint32_t)1 << j);
	}
	return 0;
}

/* Shift-Or style search over overlapping 2-grams of 'data'.
 *
 * "state" holds one bit per position inside a pattern; a 0 bit means that
 * position is currently reachable. For every 2-gram the state is shifted
 * and OR-ed with the transition mask B[gram].
 *
 * A probable match is reported when a reachable position is also marked
 * as a possible pattern end. The cheap per-byte end_fast table is checked
 * first, and the larger per-2-gram end table is only consulted when
 * end_fast already agrees.
 *
 * Returns -1 when nothing can match (also for len == 0). Otherwise returns
 * the index of the 2-gram that triggered, moved back by MAXSOPATLEN and
 * clamped at 0. */
static long SO_search(const struct filter *m, const unsigned char *data, unsigned long len)
{
	const uint32_t *trans = m->B;
	const uint32_t *ends = m->end;
	const uint32_t *ends_fast = m->end_fast;
	uint32_t state = ~0;
	size_t idx = 0;

	if(len == 0)
		return -1;

	while(idx < len-1) {
		const uint16_t gram = cli_readint16( &data[idx] );

		state = (state << 1) | trans[gram];
		if((state | ends_fast[data[idx+1]]) != 0xffffffff) {
			/* only now touch the big table */
			if((state | ends[gram]) != 0xffffffff)
				return idx >= MAXSOPATLEN  ? idx - MAXSOPATLEN : 0;
		}
		idx++;
	}

	/* no match */
	return -1;
}

/* ----------------------------------------------------------- */


#define MATCH_SUCCESS 0 
#define MATCH_FAILED  -1

/*
 * Call this function when an unrecoverable error has occurred (instead of exit).
 * It releases the matcher through regex_list_done() and then marks it as
 * failed. The order matters: regex_list_done() resets list_inited to 0, so
 * the -1 marker has to be stored afterwards.
 */
static void fatal_error(struct regex_matcher* matcher)
{
	regex_list_done(matcher);
	matcher->list_inited = -1;/* the phishing module will know we tried to load a whitelist, and failed, so it will disable itself too*/
}


/*
 * Return the character found at 1-based position 'pos'.
 *
 * Without 'info' the lookup is done directly in 'buffer'; a position
 * beyond the terminating NUL yields '\0', and position 0 is treated like
 * position 1.
 *
 * With 'info' the lookup is done in info->pre_displayLink.data instead:
 * 'pos' is first offset by info->host_start, leading non-alphanumeric
 * characters are skipped, and spaces do not count as positions.
 */
static inline size_t get_char_at_pos_with_skip(const struct pre_fixup_info* info, const char* buffer, size_t pos)
{
	const char* str;
	size_t realpos = 0;
	if(!info) {
		return (pos <= strlen(buffer)) ? buffer[pos>0 ? pos-1:0] : '\0';
	}
	str = info->pre_displayLink.data;
	/* %lu needs unsigned long arguments, size_t may be a different type */
	cli_dbgmsg("calc_pos_with_skip: skip:%lu, %lu - %lu \"%s\",\"%s\"\n", (unsigned long)pos, (unsigned long)info->host_start, (unsigned long)info->host_end, str, buffer);
	pos += info->host_start;
	/* <ctype.h> functions require an unsigned char value */
	while(str[realpos] && !isalnum((unsigned char)str[realpos])) realpos++;
	for(; str[realpos] && (pos>0); pos--) {
		while(str[realpos]==' ') realpos++;
		realpos++;
	}
	while(str[realpos]==' ') realpos++;
	cli_dbgmsg("calc_pos_with_skip:%s\n",str+realpos);
	return (pos>0 && !str[realpos]) ? '\0' : str[realpos>0?realpos-1:0];
}

/*
 * Decide whether a static pattern hit is a real host match.
 *
 * The hit is accepted when the character following the matched text is one
 * of ' ', NUL, '/' or '?', and either the pattern is as long as the buffer
 * (full match) or the character in front of the matched part is '.' or ' '
 * (subdomain match).
 *
 * On acceptance, if real_url has no '.' in front of the matched part, the
 * text of orig_real_url is shifted left by one and a '.' is written in
 * front of the matched part.
 *
 * Returns 1 for an accepted match, 0 otherwise (also when 'regex' or its
 * pattern is NULL).
 */
static int validate_subdomain(const struct regex_list *regex, const struct pre_fixup_info *pre_fixup, const char *buffer, size_t buffer_len, char *real_url, size_t real_len, char *orig_real_url)
{
	char c;
	size_t match_len;
	int accepted = 0;

	if(regex == NULL || regex->pattern == NULL)
		return 0;
	match_len = strlen(regex->pattern);

	/* character right after the matched text */
	c = get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len+1);
	if(c==' ' || c=='\0' || c=='/' || c=='?') {
		if(match_len == buffer_len) {
			/* full match */
			accepted = 1;
		} else if(match_len < buffer_len) {
			/* subdomain matched? look at the character in front */
			c = get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len-match_len);
			accepted = (c=='.' || c==' ');
		}
	}

	if(!accepted) {
		cli_dbgmsg("Ignoring false match: %s with %s, mismatched character: %c\n", buffer, regex->pattern, c);
		return 0;
	}

	cli_dbgmsg("Got a match: %s with %s\n", buffer, regex->pattern);
	cli_dbgmsg("Before inserting .: %s\n", orig_real_url);
	if(real_len >= match_len + 1 && real_url[real_len - match_len - 1] != '.') {
		/* get_host put an extra '.' at the start of orig_real_url to
		 * make room: shift left over it and place a '.' before the
		 * matched part */
		const size_t dot_pos = real_len - match_len - 1;
		const size_t orig_real_len = strlen(orig_real_url);

		cli_dbgmsg("No dot here:%s\n",real_url+dot_pos);
		memmove(orig_real_url, orig_real_url+1, orig_real_len-match_len-1);
		orig_real_url[orig_real_len-match_len-1]='.';
		cli_dbgmsg("After inserting .: %s\n", orig_real_url);
	}
	return 1;
}

/*
 * Look up a URL (pair) in the list.
 *
 * @matcher - matcher structure to use
 * @real_url - href target; may be modified in place (a '.' can be inserted
 *             in front of a matched subdomain)
 * @display_url - <a> tag contents
 * @pre_fixup - passed through to the static pattern validation, may be NULL
 * @hostOnly - if you want to match only the host part
 * @info - set to the matching pattern, or to NULL if nothing matched
 * @is_whitelist - is this a lookup in whitelist?
 *
 * The text looked up is "real_url/" for host-only lookups in a
 * non-whitelist, and "real_url:display_url/" otherwise.
 *
 * @return - CL_SUCCESS - url doesn't match (also when the list is not
 *                        initialized)
 *         - CL_VIRUS - url matches list
 *         - CL_EMEM or another error code on failure
 *
 * Do not send NULL pointers to this function!!
 *
 */
int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup,int hostOnly,const char **info, int is_whitelist)
{
	char* orig_real_url = real_url;
	const char *vinfo = NULL;
	struct regex_list *regex;
	size_t real_len, display_len, buffer_len;
	char *buffer;
	char *bufrev;
	int rc = 0;
	struct cli_ac_data mdata;

	assert(matcher);
	assert(real_url);
	assert(display_url);
	*info = NULL;
	if(!matcher->list_inited)
		return 0;
	assert(matcher->list_built);
	/* skip initial '.' inserted by get_host */
	if(real_url[0] == '.') real_url++;
	if(display_url[0] == '.') display_url++;

	real_len    = strlen(real_url);
	display_len = strlen(display_url);
	buffer_len  = (hostOnly && !is_whitelist) ? real_len + 1 : real_len + display_len + 1 + 1;
	buffer = cli_malloc(buffer_len+1);
	if(!buffer)
		return CL_EMEM;

	/* lengths are known, so plain copies are enough; the separator and
	 * the terminator are written explicitly below */
	memcpy(buffer,real_url,real_len);
	buffer[real_len]= (!is_whitelist && hostOnly) ? '/' : ':';
	if(!hostOnly || is_whitelist) {
		memcpy(buffer+real_len+1,display_url,display_len);
	}
	buffer[buffer_len - 1] = '/';
	buffer[buffer_len]=0;
	cli_dbgmsg("Looking up in regex_list: %s\n", buffer);

	/* suffixes are stored reversed, so search the reversed text */
	bufrev = cli_strdup(buffer);
	if(!bufrev) {
		free(buffer);
		return CL_EMEM;
	}
	reverse_string(bufrev);

	if(SO_search(&matcher->filter, (const unsigned char*)bufrev, buffer_len) == -1) {
		/* filter says this suffix doesn't match.
		 * The filter has false positives, but no false
		 * negatives */
		free(bufrev);
		free(buffer);
		return 0;
	}

	if((rc = cli_ac_initdata(&mdata, 0, AC_DEFAULT_TRACKLEN))) {
		free(bufrev);
		free(buffer);
		return rc;
	}
	rc = cli_ac_scanbuff((unsigned char*)bufrev,buffer_len, &vinfo, &matcher->suffixes,&mdata,0,0,-1,NULL,AC_SCAN_VIR,NULL);
	cli_ac_freedata(&mdata);
	free(bufrev);

	if(rc && vinfo) {
		/* TODO loop over multiple virusnames here */
		regex = (struct regex_list*)vinfo;
		do {
			/* loop over multiple regexes corresponding to
			 * this suffix */
			if (!regex->preg.re_magic) {
				/* we matched a static pattern */
				rc = validate_subdomain(regex, pre_fixup, buffer, buffer_len, real_url, real_len, orig_real_url);
			} else {
				rc = !cli_regexec(&regex->preg, buffer, 0, NULL, 0);
			}
			if(rc) *info = regex->pattern;
			regex = regex->nxt;
		} while(!rc && regex);
	}
	free(buffer);
	if(!rc)
		cli_dbgmsg("Lookup result: not in regex list\n");
	else
		cli_dbgmsg("Lookup result: in regex list\n");
	return rc;
}


/* Initialization & loading */
/* Initializes @matcher, allocating necessary substructures.
 *
 * The structure is zeroed first, so it must not own anything when this is
 * called. Returns CL_SUCCESS with list_inited set to 1, or the error code
 * of cli_ac_init() with list_inited left at 0, which is what
 * load_regex_matcher() checks for.
 * NOTE(review): the result of hashtab_init() is not checked here, as its
 * return type is not visible in this file. */
int init_regex_list(struct regex_matcher* matcher)
{
	int rc;

	assert(matcher);
	/* also clears list_inited, list_built and list_loaded */
	memset(matcher, 0, sizeof(*matcher));

	hashtab_init(&matcher->suffix_hash, 10);
	if((rc = cli_ac_init(&matcher->suffixes, 2, 32))) {
		/* undo the partial setup and report "not initialized" */
		hashtab_free(&matcher->suffix_hash);
		return rc;
	}
	SO_init(&matcher->filter);

	matcher->list_inited=1;
	matcher->list_built=0;
	matcher->list_loaded=0;
	return CL_SUCCESS;
}

/*
 * Check an optional ":min-max" functionality level suffix of a db line.
 *
 * The text after the last ':' is taken as a level range only if it has the
 * form <digits>-<digits> (either side may be empty). In every other case
 * the line is left untouched and CL_SUCCESS is returned.
 *
 * For a real range the '-' is overwritten with NUL. When the current level
 * (cl_retflevel()) lies inside the range the whole suffix is cut off the
 * line and CL_SUCCESS is returned; otherwise CL_EMALFDB is returned, which
 * the caller uses to skip the line.
 */
static int functionality_level_check(char* line)
{
	char* ptmin;
	char* ptmax;
	size_t j;

	ptmin = strrchr(line,':');
	if(!ptmin) 
		return CL_SUCCESS;
	
	ptmin++;

	ptmax = strchr(ptmin,'-');
	if(!ptmax) 
		return CL_SUCCESS;/* there is no functionality level specified, so we're ok */
	else {
		size_t flevel_min, flevel_max;
		size_t maxlen;
		ptmax++;
		/* <ctype.h> functions require an unsigned char value */
		for(j=0;j+ptmin+1 < ptmax;j++)
			if(!isdigit((unsigned char)ptmin[j])) 
				return CL_SUCCESS;/* not numbers, not functionality level */
		maxlen = strlen(ptmax);
		for(j=0;j<maxlen;j++)
			if(!isdigit((unsigned char)ptmax[j])) 
				return CL_SUCCESS;/* see above */
		ptmax[-1]='\0';
		flevel_min = atoi(ptmin);
		if(maxlen==0)
			flevel_max = INT_MAX; /* open ended range */
		else
			flevel_max = atoi(ptmax);

		if(flevel_min > cl_retflevel()) {
			cli_dbgmsg("regex list line %s not loaded (required f-level: %u)\n",line,(unsigned int)flevel_min);
			return CL_EMALFDB; 
		}

		if(flevel_max < cl_retflevel()) 
			return CL_EMALFDB;
		/* strip the ":min-max" suffix from the line */
		ptmin[-1]='\0';
		return CL_SUCCESS;
	}		
}


/* Load patterns/regexes from file
 *
 * @matcher - matcher to load into; it is initialized here if needed
 * @fd, @dbio - input; at least one of them must be non-NULL
 * @options - not used by this function
 * @is_whitelist - selects which flag letters are accepted (see below)
 *
 * Returns CL_SUCCESS, CL_EIO when there is no input, CL_EMEM, or
 * CL_EMALFDB for malformed input / a matcher that already failed to load.
 */
int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int options,int is_whitelist,struct cli_dbio *dbio)
{
	int rc,line=0;
	char buffer[FILEBUFF];

	assert(matcher);

	if(matcher->list_inited==-1)
		return CL_EMALFDB; /* already failed to load */
	if(!fd && !dbio) {
		cli_errmsg("Unable to load regex list (null file)\n");
		return CL_EIO;
	}

	cli_dbgmsg("Loading regex_list\n");
	if(!matcher->list_inited) {
		rc = init_regex_list(matcher);
		/* trust the return code as well as the flag */
		if (rc || !matcher->list_inited) {
			cli_errmsg("Regex list failed to initialize!\n");
			fatal_error(matcher);
			return rc ? rc : CL_EMEM;
		}
	}
	/*
	 * Regexlist db format (common to .wdb(whitelist) and .pdb(domainlist) files:
	 * Multiple lines of form, (empty lines are skipped):
 	 * Flags RealURL DisplayedURL
	 * Where:
	 * Flags: 
	 *
	 * .pdb files:
	 * R - regex, H - host-only, followed by (optional) 3-digit hexnumber representing 
	 * flags that should be filtered.
	 * [i.e. phishcheck urls.flags that we don't want to be done for this particular host]
	 * 
	 * .wdb files:
	 * X - full URL regex 
	 * Y - host-only regex
	 * M - host simple pattern
	 *
	 * If a line in the file doesn't conform to this format, loading fails
	 * 
	 */
	while(cli_dbgets(buffer, FILEBUFF, fd, dbio)) {
		char* pattern;
		size_t pattern_len;

		cli_chomp(buffer);
		if(!*buffer)
			continue;/* skip empty lines */

		/* lines meant for another functionality level are skipped */
		if(functionality_level_check(buffer))
			continue;

		line++;
		pattern = strchr(buffer,':');
		if(!pattern) {
			cli_errmsg("Malformed regex list line %d\n",line);
			fatal_error(matcher);
			return CL_EMALFDB;
		}
		pattern++;

		pattern_len = strlen(pattern);
		/* a '/' and a NUL are appended in place: make sure both still
		 * fit, counting from the start of the buffer and not only
		 * from the start of the pattern */
		if((size_t)(pattern - buffer) + pattern_len + 2 <= FILEBUFF) {
			pattern[pattern_len] = '/';
			pattern[pattern_len+1] = '\0';
		}
		else {
			cli_errmsg("Overlong regex line %d\n",line);
			fatal_error(matcher);
			return CL_EMALFDB;
		}

		if((buffer[0] == 'R' && !is_whitelist) || ((buffer[0] == 'X' || buffer[0] == 'Y') && is_whitelist)) {
			/* regex for hostname*/
			if (( rc = add_pattern(matcher, pattern) ))
				return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
		}
		else if( ( buffer[0] == 'H' && !is_whitelist) || (buffer[0] == 'M' && is_whitelist)) {
			/*matches displayed host*/
			if (( rc = add_static_pattern(matcher, pattern) ))
				return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
		}
		else {
			/* unknown flag letter for this kind of list */
			return CL_EMALFDB;
		}
	}
	matcher->list_loaded = 1;

	return CL_SUCCESS;
}


/* Build the matcher list
 *
 * Finalizes a loaded matcher: the suffix hash, only needed while loading,
 * is released and the suffix trie is built. A NULL matcher is accepted and
 * reported as success.
 * Returns CL_SUCCESS, -1 if the list was not initialized and loaded, or the
 * error code of cli_ac_buildtrie(). */
int cli_build_regex_list(struct regex_matcher* matcher)
{
	int status;

	if(matcher == NULL)
		return CL_SUCCESS;

	if(!(matcher->list_inited && matcher->list_loaded)) {
		cli_errmsg("Regex list not loaded!\n");
		return -1;/*TODO: better error code */
	}

	cli_dbgmsg("Building regex list\n");
	hashtab_free(&matcher->suffix_hash);

	status = cli_ac_buildtrie(&matcher->suffixes);
	if(status)
		return status;

	matcher->list_built=1;
	return CL_SUCCESS;
}

/* Done with this matcher, free resources
 *
 * For a loaded list: every regex reachable from the suffix table is passed
 * to cli_regfree(), the table itself and the suffix hash are freed, and the
 * built/loaded flags are cleared. In all cases list_inited ends up 0. */
void regex_list_done(struct regex_matcher* matcher)
{
	assert(matcher);

	if(matcher->list_loaded) {
		/* TODO: call cli_ac_free(&matcher->suffixes), but be sure it
		 * won't free virname */
		if(matcher->suffix_regexes != NULL) {
			size_t idx = 0;

			while(idx < matcher->suffix_cnt) {
				struct regex_list *cur;

				for(cur = matcher->suffix_regexes[idx]; cur != NULL; cur = cur->nxt)
					cli_regfree(&cur->preg);
				idx++;
			}
			free(matcher->suffix_regexes);
			matcher->suffix_regexes = NULL;
		}
		hashtab_free(&matcher->suffix_hash);
		matcher->list_built=0;
		matcher->list_loaded=0;
	}

	matcher->list_inited=0;
}

/*
 * Returns 0 only when loading this matcher has failed (list_inited is -1).
 * A matcher that was never initialized, or that was initialized
 * successfully, yields 1.
 */
int is_regex_ok(struct regex_matcher* matcher)
{
	assert(matcher);
	return matcher->list_inited != -1;
}

/*
 * Register a new (reversed) suffix with the suffix matcher and the
 * shift-or filter.
 *
 * @info   - regex list for this suffix; stored in the pattern's virname
 *           field so that a hit can be mapped back to its regexes
 * @suffix - the suffix bytes
 * @len    - number of bytes in @suffix
 *
 * Returns CL_SUCCESS, CL_EMEM, or the error code of cli_ac_addpatt().
 */
static int add_newsuffix(struct regex_matcher *matcher, struct regex_list *info, char *suffix, size_t len)
{
	struct cli_matcher *root = &matcher->suffixes;
	struct cli_ac_patt *new;
	size_t i;
	int ret;

	assert(root && suffix);

	/* zero-filled, so only the non-zero fields need to be set below */
	new = cli_calloc(1,sizeof(*new));
	if(!new)
		return CL_EMEM;

	new->rtype = 0;
	new->type = 0;
	new->sigid = 0;
	new->parts = 0;
	new->partno = 0;
	new->mindist = 0;
	new->maxdist = 0;
	new->offset = 0;
	new->target = 0;
	new->length = len;

	new->ch[1] |= CLI_MATCH_IGNORE;
	new->ch[0] = new->ch[1];
	if(new->length > root->maxpatlen)
		root->maxpatlen = new->length;

	new->pattern = cli_malloc(sizeof(new->pattern[0])*len);
	if(!new->pattern) {
		free(new);
		return CL_EMEM;
	}
	/* new->pattern is wider than char: convert through unsigned char so
	 * that bytes >= 0x80 are not sign-extended into the upper bits */
	for(i=0;i<len;i++)
		new->pattern[i] = (unsigned char)suffix[i];

	new->virname = (char*)info;
	if((ret = cli_ac_addpatt(root,new))) {
		free(new->pattern);
		free(new);
		return ret;
	}
	SO_preprocess_add(&matcher->filter, (const unsigned char*)suffix, len);
	return CL_SUCCESS;
}

#define MODULE "regex_list: "
/* ------ load a regex, determine suffix, determine suffix2regexlist map ---- */

/* Associate 'regex' with the (reversed) suffix 'suffix'.
 *
 * A suffix seen before gets 'regex' pushed to the front of its list. A new
 * suffix gets a new slot in matcher->suffix_regexes, an entry in the suffix
 * hash and is added to the matcher/filter through add_newsuffix().
 *
 * returns 0 on success, clamav error code otherwise */
static int add_pattern_suffix(struct regex_matcher *matcher, char *suffix, size_t suffix_len, struct regex_list *regex)
{
	const struct element *el;

	assert(matcher);
	el = hashtab_find(&matcher->suffix_hash, suffix, suffix_len);
	/* TODO: what if suffixes are prefixes of eachother and only one will
	 * match? */
	if(el) {
		/* existing suffix */
		assert(el->data < matcher->suffix_cnt);
		regex->nxt = matcher->suffix_regexes[el->data];
		matcher->suffix_regexes[el->data] = regex;
		cli_dbgmsg(MODULE "added new regex to existing suffix %s: %s\n", suffix, regex->pattern);
	} else {
		/* new suffix */
		const size_t n = matcher->suffix_cnt;
		struct regex_list **grown;
		int rc;

		/* keep the old table reachable if growing it fails */
		grown = cli_realloc(matcher->suffix_regexes, (n+1)*sizeof(*matcher->suffix_regexes));
		if(!grown)
			return CL_EMEM;
		matcher->suffix_regexes = grown;
		matcher->suffix_regexes[n] = regex;

		el = hashtab_insert(&matcher->suffix_hash, suffix, suffix_len, n);
		if(!el)
			return CL_EMEM;
		/* the slot only counts once the hash refers to it */
		matcher->suffix_cnt = n+1;

		if((rc = add_newsuffix(matcher, regex, suffix, suffix_len)))
			return rc;
		cli_dbgmsg(MODULE "added new suffix %s, for regex: %s\n", suffix, regex->pattern);
	}
	return 0;
}

/*
 * Reverse the NUL-terminated string 'pattern' in place.
 * Returns its length.
 */
static size_t reverse_string(char *pattern)
{
	const size_t len = strlen(pattern);
	char *lo;
	char *hi;

	if(len < 2)
		return len; /* nothing to swap */

	/* swap from both ends until the two cursors meet */
	lo = pattern;
	hi = pattern + len - 1;
	while(lo < hi) {
		const char tmp = *lo;
		*lo++ = *hi;
		*hi-- = tmp;
	}
	return len;
}

/*
 * Add a plain (non-regex) host pattern.
 *
 * 'pattern' is reversed in place; the reversed text is used as the suffix
 * and a copy of it is kept in the new regex_list entry. re_magic is set to
 * 0, which is how regex_list_match() recognizes a static pattern.
 *
 * Returns 0 on success, CL_EMEM or another error code otherwise.
 */
static int add_static_pattern(struct regex_matcher *matcher, char* pattern)
{
	size_t len;
	struct regex_list *regex = cli_malloc(sizeof(*regex));
	if(!regex)
		return CL_EMEM;
	len = reverse_string(pattern);
	regex->nxt = NULL;
	regex->pattern = cli_strdup(pattern);
	if(!regex->pattern) {
		/* do not register an entry without its pattern text */
		free(regex);
		return CL_EMEM;
	}
	regex->preg.re_magic = 0;
	return add_pattern_suffix(matcher, pattern, len, regex);
}

/*
 * Add a regular expression.
 *
 * A trailing "match the rest of the URL" part is cut off 'pattern' (in
 * place), the remainder is compiled, and its static suffixes are extracted
 * and registered so that the suffix matcher can find candidate regexes.
 *
 * Returns 0 on success, CL_EMEM on allocation failure, CL_EMALFDB when the
 * expression does not compile or yields no parse tree, or the result of
 * build_suffixtree_descend().
 */
static int add_pattern(struct regex_matcher *matcher, char *pattern)
{
	struct text_buffer buf;
	struct node *n;
	size_t last=0;
	int rc;
	struct regex_list *regex = cli_malloc(sizeof(*regex));
	struct node root_node;
	size_t len;
	/* we only match the host, so remove useless stuff */
	const char remove_end[] = "([/?].*)?/";
	const char remove_end2[] = "([/?].*)/";
	const size_t end_len = sizeof(remove_end) - 1;
	const size_t end2_len = sizeof(remove_end2) - 1;


	if(!regex)
		return CL_EMEM;

	len = strlen(pattern);
	if(len > sizeof(remove_end)) {
		if(strncmp(&pattern[len - end_len], remove_end, end_len) == 0) {
			len -= end_len;
		}
		/* len may just have shrunk: check it again before indexing */
		if(len >= end2_len && strncmp(&pattern[len - end2_len], remove_end2, end2_len) == 0) {
			len -= end2_len;
		}
	}
	pattern[len] = '\0';


	rc = cli_regcomp(&regex->preg, pattern, REG_EXTENDED);
	if(rc) {
		size_t buflen = cli_regerror(rc, &regex->preg, NULL, 0);
		char *errbuf = cli_malloc(buflen);
		if(errbuf) {
			cli_regerror(rc, &regex->preg, errbuf, buflen);
			cli_errmsg(MODULE "Error compiling regular expression %s: %s\n", pattern, errbuf);
			free(errbuf);
		} else {
			cli_errmsg(MODULE "Error compiling regular expression: %s\n", pattern);
		}
		/* preg was not compiled, so there is nothing to regfree;
		 * report a clamav error code, not the regex one */
		free(regex);
		return CL_EMALFDB;
	}
	regex->pattern = cli_strdup(pattern);
	regex->nxt = NULL;
	if(!regex->pattern) {
		cli_regfree(&regex->preg);
		free(regex);
		return CL_EMEM;
	}

	n = parse_regex(pattern, &last);
	if(!n) {
		/* nothing could be parsed (or parsing ran out of memory):
		 * there is no tree to extract a suffix from */
		cli_regfree(&regex->preg);
		free(regex->pattern);
		free(regex);
		return CL_EMALFDB;
	}
	memset(&buf, 0, sizeof(buf));
	/* a zeroed node has type 'root'; it marks the top of the tree */
	memset(&root_node, 0, sizeof(root_node));
	n->parent = &root_node;

	rc = build_suffixtree_descend(matcher, regex, n, &buf);
	/* the scratch buffer grown by textbuffer_putc() is ours to release */
	free(buf.data);
	destroy_tree(n);
	return rc;
}