2006-09-12 19:38:39 +00:00
/*
* Match a string against a list of patterns / regexes .
*
2007-03-11 11:14:35 +00:00
* Copyright ( C ) 2006 - 2007 T <EFBFBD> r <EFBFBD> k Edvin < edwin @ clamav . net >
2006-09-12 19:38:39 +00:00
*
* This program is free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation ; either version 2 of the License , or
* ( at your option ) any later version .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
*
* You should have received a copy of the GNU General Public License
* along with this program ; if not , write to the Free Software
* Foundation , Inc . , 51 Franklin Street , Fifth Floor , Boston ,
* MA 02110 - 1301 , USA .
*
*/
# if HAVE_CONFIG_H
# include "clamav-config.h"
# endif
# ifndef CL_DEBUG
# define NDEBUG
# endif
# ifdef CL_THREAD_SAFE
# ifndef _REENTRANT
# define _REENTRANT
# endif
# endif
2007-06-09 16:09:45 +00:00
/* TODO: when implementation of new version is complete, enable it in CL_EXPERIMENTAL */
# ifdef CL_EXPERIMENTAL
//#define USE_NEW_VERSION
# endif
# ifndef USE_NEW_VERSION
/*this is the old version of regex_list.c
* reason for redesign : there is only one node type that has to handle all the cases : binary search among children , alternatives list , match .
* This design is very error - prone . */
2006-09-12 19:38:39 +00:00
# include <stdio.h>
# include <stdlib.h>
# include <string.h>
# include <ctype.h>
# include <limits.h>
# include <sys/types.h>
2006-09-26 18:55:36 +00:00
# ifdef HAVE_REGEX_H
2006-09-12 19:38:39 +00:00
# include <regex.h>
2006-09-26 18:55:36 +00:00
# endif
2006-09-12 19:38:39 +00:00
# include "clamav.h"
# include "others.h"
# include "regex_list.h"
# include "matcher-ac.h"
2007-03-11 11:14:35 +00:00
# include "str.h"
2006-09-12 19:38:39 +00:00
/*Tree*/
enum token_op_t { OP_CHAR , OP_STDCLASS , OP_CUSTOMCLASS , OP_DOT , OP_LEAF , OP_ROOT , OP_PARCLOSE } ;
2006-10-10 23:51:49 +00:00
typedef unsigned char * char_bitmap_p ;
2006-09-12 19:38:39 +00:00
/*
*
* OP_CHAR : 1 character , c = character
* complex stuff :
* OP_STDCLASS : standard character class , c = char class , class : 1 < < ( index into std_class of class name )
* OP_CUSTOMCLASS : custom character class , first pointer in ptr array is a pointer to the bitmap table for this class
* OP_DOT : single . matching any character except \ n
* OP_LEAF : this is a leaf node , reinterpret structure
*/
struct tree_node {
2006-10-10 23:51:49 +00:00
struct tree_node * next ; /* next regex/complex sibling, or parent, if no more siblings , can't be NULL except for root node*/
2006-09-12 19:38:39 +00:00
unsigned char c ;
2006-10-10 23:51:49 +00:00
enum token_op_t op ;
2006-09-12 19:38:39 +00:00
char alternatives ; /* number of (non-regex) children of node, i.e. sizeof(children)*/
char listend ; /* no more siblings, next pointer is pointer to parent*/
union {
struct tree_node * * children ; /* alternatives nr. of children, followed by (a null pointer terminated) regex leaf node pointers) */
char_bitmap_p * bitmap ;
struct leaf_info * leaf ;
} u ;
} ;
struct leaf_info {
char * info ; /* what does it mean that we reached the leaf...*/
regex_t * preg ; /* this is NULL if leaf node, and non-regex*/
} ;
/* Character classes */
2006-10-10 23:51:49 +00:00
static const char * std_class [ ] = {
" [:alnum:] " ,
" [:digit:] " ,
" [:punct:] " ,
" [:alpha:] " ,
" [:graph:] " ,
" [:space:] " ,
" [:blank:] " ,
" [:lower:] " ,
" [:upper:] " ,
" [:cntrl:] " ,
" [:print:] " ,
" [:xdigit:] "
/* don't change the order of these strings, unless you change them in generate_tables.c too, and regenerate the tables*/
2006-09-12 19:38:39 +00:00
} ;
2006-10-07 11:00:46 +00:00
2006-09-12 19:38:39 +00:00
# define STD_CLASS_CNT sizeof(std_class) / sizeof(std_class[0])
2006-10-07 11:00:46 +00:00
2006-10-10 23:51:49 +00:00
/* generated by contrib/phishing/generate_tables.c */
static const unsigned char char_class_bitmap [ STD_CLASS_CNT ] [ 32 ] = {
{ 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0xff , 0x03 ,
0xfe , 0xff , 0xff , 0x07 , 0xfe , 0xff , 0xff , 0x07 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0xff , 0x03 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x00 , 0x00 , 0x00 , 0xfe , 0xff , 0x00 , 0xfc ,
0x01 , 0x00 , 0x00 , 0xf8 , 0x01 , 0x00 , 0x00 , 0x78 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0xfe , 0xff , 0xff , 0x07 , 0xfe , 0xff , 0xff , 0x07 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x00 , 0x00 , 0x00 , 0xfe , 0xff , 0xff , 0xff ,
0xff , 0xff , 0xff , 0xff , 0xff , 0xff , 0xff , 0x7f ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x3e , 0x00 , 0x00 , 0x01 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x02 , 0x00 , 0x00 , 0x01 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0xfe , 0xff , 0xff , 0x07 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0xfe , 0xff , 0xff , 0x07 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0xff , 0xff , 0xff , 0xff , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x80 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x00 , 0x00 , 0x00 , 0xff , 0xff , 0xff , 0xff ,
0xff , 0xff , 0xff , 0xff , 0xff , 0xff , 0xff , 0x7f ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0xff , 0x03 ,
0x7e , 0x00 , 0x00 , 0x00 , 0x7e , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 }
} ;
2006-10-07 11:00:46 +00:00
2006-10-10 23:51:49 +00:00
static const unsigned short int char_class [ 256 ] = {
0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x260 , 0x220 , 0x220 , 0x220 , 0x220 , 0x200 , 0x200 ,
0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 ,
0x460 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 ,
0xc13 , 0xc13 , 0xc13 , 0xc13 , 0xc13 , 0xc13 , 0xc13 , 0xc13 , 0xc13 , 0xc13 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 ,
0x414 , 0xd19 , 0xd19 , 0xd19 , 0xd19 , 0xd19 , 0xd19 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 ,
0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 ,
0x414 , 0xc99 , 0xc99 , 0xc99 , 0xc99 , 0xc99 , 0xc99 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 ,
0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x414 , 0x414 , 0x414 , 0x414 , 0x200 ,
0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 ,
0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 ,
0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 ,
0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 ,
0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 ,
0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 ,
0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 ,
0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000
} ;
2006-09-12 19:38:39 +00:00
2006-10-10 23:51:49 +00:00
static const size_t std_class_cnt = sizeof ( std_class ) / sizeof ( std_class [ 0 ] ) ;
2006-10-07 11:00:46 +00:00
2006-09-12 19:38:39 +00:00
/* Prototypes */
2007-03-18 23:27:15 +00:00
static int add_pattern ( struct regex_matcher * matcher , const unsigned char * pat , const char * info , int hostOnly ) ;
2006-09-12 19:38:39 +00:00
static int match_node ( struct tree_node * node , const unsigned char * c , size_t len , const char * * info ) ;
static void destroy_tree ( struct regex_matcher * matcher ) ;
2006-10-10 23:51:49 +00:00
static struct tree_node * tree_root_alloc ( void ) ;
static int build_regex_list ( struct regex_matcher * matcher ) ;
static void stack_destroy ( struct node_stack * stack ) ;
2006-09-12 19:38:39 +00:00
2006-10-10 23:51:49 +00:00
# ifndef NDEBUG
void dump_tree ( struct tree_node * root ) ;
# endif
2006-09-12 19:38:39 +00:00
# define MATCH_SUCCESS 0
# define MATCH_FAILED -1
/*
* Call this function when an unrecoverable error has occured , ( instead of exit ) .
*/
static void fatal_error ( struct regex_matcher * matcher )
{
regex_list_done ( matcher ) ;
matcher - > list_inited = - 1 ; /* the phishing module will know we tried to load a whitelist, and failed, so it will disable itself too*/
}
/*
* @ matcher - matcher structure to use
* @ real_url - href target
* @ display_url - < a > tag contents
* @ hostOnly - if you want to match only the host part
2006-10-10 23:51:49 +00:00
* @ is_whitelist - is this a lookup in whitelist ?
2006-09-12 19:38:39 +00:00
*
* @ return - CL_SUCCESS - url doesn ' t match
* - CL_VIRUS - url matches list
*
* Do not send NULL pointers to this function ! !
*
*/
2006-10-10 23:51:49 +00:00
int regex_list_match ( struct regex_matcher * matcher , const char * real_url , const char * display_url , int hostOnly , const char * * info , int is_whitelist )
2006-09-12 19:38:39 +00:00
{
2006-10-10 23:51:49 +00:00
massert ( matcher ) ;
massert ( real_url ) ;
massert ( display_url ) ;
massert ( info ) ;
2006-09-12 19:38:39 +00:00
if ( ! matcher - > list_inited )
return 0 ;
2006-10-10 23:51:49 +00:00
massert ( matcher - > list_built ) ;
2006-09-12 19:38:39 +00:00
{
size_t real_len = strlen ( real_url ) ;
size_t display_len = strlen ( display_url ) ;
2006-12-19 20:47:45 +00:00
size_t buffer_len = ( hostOnly & & ! is_whitelist ) ? real_len : real_len + display_len + 1 + ( is_whitelist ? 1 : 0 ) ;
2006-09-12 19:38:39 +00:00
char * buffer = cli_malloc ( buffer_len + 1 ) ;
2006-10-15 19:16:33 +00:00
size_t i ;
2006-11-15 15:26:54 +00:00
int rc = 0 ;
struct cli_ac_data mdata ;
2006-09-12 19:38:39 +00:00
if ( ! buffer )
return CL_EMEM ;
strncpy ( buffer , real_url , real_len ) ;
2006-10-28 15:34:40 +00:00
buffer [ real_len ] = ( ! is_whitelist & & hostOnly ) ? ' \0 ' : ' : ' ;
2006-10-10 23:51:49 +00:00
if ( ! hostOnly | | is_whitelist ) {
2006-09-27 19:16:11 +00:00
strncpy ( buffer + real_len + 1 , display_url , display_len ) ;
2006-12-19 20:47:45 +00:00
if ( is_whitelist )
buffer [ buffer_len - 1 ] = ' / ' ;
2006-09-27 19:16:11 +00:00
buffer [ buffer_len ] = 0 ;
2006-09-16 15:49:27 +00:00
}
2006-09-27 19:16:11 +00:00
cli_dbgmsg ( " Looking up in regex_list: %s \n " , buffer ) ;
2006-09-12 19:38:39 +00:00
2006-11-15 15:26:54 +00:00
if ( hostOnly ) {
if ( ( rc = cli_ac_initdata ( & mdata , 0 , AC_DEFAULT_TRACKLEN ) ) )
return rc ;
rc = 0 ;
2006-10-15 19:16:33 +00:00
for ( i = 0 ; i < matcher - > root_hosts_cnt ; i + + ) {
2007-06-16 16:36:22 +00:00
/* needs to match terminating \0 too */
rc = cli_ac_scanbuff ( ( unsigned char * ) buffer , buffer_len + 1 , info , & matcher - > root_hosts [ i ] , & mdata , 0 , 0 , 0 , - 1 , NULL ) ;
cli_ac_freedata ( & mdata ) ;
2007-06-26 07:40:02 +00:00
if ( rc ) {
const char * matched = strchr ( * info , ' : ' ) ;
const size_t match_len = matched ? strlen ( matched + 1 ) : 0 ;
if ( match_len = = buffer_len | | /* full match */
( match_len < buffer_len & &
buffer [ buffer_len - match_len - 1 ] = = ' . ' )
/* subdomain matched*/ ) {
cli_dbgmsg ( " Got a match:%s with %s \n " , buffer , * info ) ;
break ;
}
cli_dbgmsg ( " Ignoring false match:%s with %s \n " , buffer , * info ) ;
rc = 0 ;
}
2006-10-15 19:16:33 +00:00
}
2006-11-15 15:26:54 +00:00
} else
2006-10-15 19:16:33 +00:00
rc = 0 ;
2006-11-15 15:26:54 +00:00
2007-03-18 23:27:15 +00:00
if ( ! rc )
rc = match_node ( hostOnly ? matcher - > root_regex_hostonly : matcher - > root_regex , ( unsigned char * ) buffer , buffer_len , info ) = = MATCH_SUCCESS ? CL_VIRUS : CL_SUCCESS ;
2006-09-12 19:38:39 +00:00
free ( buffer ) ;
if ( ! rc )
2007-04-28 20:15:22 +00:00
cli_dbgmsg ( " Lookup result: not in regex list \n " ) ;
else
cli_dbgmsg ( " Lookup result: in regex list \n " ) ;
2006-09-12 19:38:39 +00:00
return rc ;
}
}
/* node stack */
# define NODE_STACK_INITIAL 1024
# define NODE_STACK_GROW 4096
/* Initialize @stack */
static int stack_init ( struct node_stack * stack )
{
2006-10-10 23:51:49 +00:00
massert ( stack ) ;
2006-09-12 19:38:39 +00:00
stack - > cnt = 0 ;
stack - > capacity = NODE_STACK_INITIAL ;
stack - > data = cli_malloc ( stack - > capacity * sizeof ( * stack - > data ) ) ;
if ( ! stack - > data )
return CL_EMEM ;
else
return CL_SUCCESS ;
}
/* Reset @stack pointer, but don't realloc */
static void stack_reset ( struct node_stack * stack )
{
2006-10-10 23:51:49 +00:00
massert ( stack ) ;
2006-09-12 19:38:39 +00:00
stack - > cnt = 0 ;
}
/* Push @node on @stack, growing it if necessarry */
2007-03-11 11:14:35 +00:00
static int stack_push ( struct node_stack * stack , struct tree_node * node )
2006-09-12 19:38:39 +00:00
{
2006-10-10 23:51:49 +00:00
massert ( stack ) ;
massert ( stack - > data ) ;
2006-09-12 19:38:39 +00:00
if ( stack - > cnt = = stack - > capacity ) {
stack - > capacity + = NODE_STACK_GROW ;
2007-05-25 23:10:58 +00:00
stack - > data = cli_realloc2 ( stack - > data , stack - > capacity * sizeof ( * stack - > data ) ) ;
2006-09-12 19:38:39 +00:00
if ( ! stack - > data )
return CL_EMEM ;
}
stack - > data [ stack - > cnt + + ] = node ;
return CL_SUCCESS ;
}
/* Pops node from @stack, doesn't realloc */
2007-03-11 11:14:35 +00:00
static struct tree_node * stack_pop ( struct node_stack * stack )
2006-09-12 19:38:39 +00:00
{
2006-10-10 23:51:49 +00:00
massert ( stack ) ;
massert ( stack - > data ) ;
massert ( stack - > cnt ) ; /*don't pop from empty stack */
2006-09-12 19:38:39 +00:00
return stack - > cnt ? stack - > data [ - - stack - > cnt ] : NULL ;
}
/* Initialization & loading */
/* Initializes @matcher, allocating necesarry substructures */
int init_regex_list ( struct regex_matcher * matcher )
{
2006-10-15 19:16:33 +00:00
int rc ;
2006-10-10 23:51:49 +00:00
massert ( matcher ) ;
2006-09-12 19:38:39 +00:00
matcher - > list_inited = 0 ;
2006-10-15 19:16:33 +00:00
matcher - > root_hosts_cnt = 0 ;
matcher - > root_hosts = NULL ;
matcher - > root_hosts_cnt = 0 ;
2006-09-12 19:38:39 +00:00
matcher - > root_regex = tree_root_alloc ( ) ;
if ( ! matcher - > root_regex ) {
return CL_EMEM ;
}
2007-03-18 23:27:15 +00:00
matcher - > root_regex_hostonly = tree_root_alloc ( ) ;
if ( ! matcher - > root_regex_hostonly ) {
free ( matcher - > root_regex ) ;
return CL_EMEM ;
}
2006-10-15 19:16:33 +00:00
if ( ( rc = stack_init ( & matcher - > node_stack ) ) ) {
2007-03-18 23:27:15 +00:00
free ( matcher - > root_regex_hostonly ) ;
2006-10-15 19:16:33 +00:00
free ( matcher - > root_regex ) ;
return rc ;
}
if ( ( rc = stack_init ( & matcher - > node_stack_alt ) ) ) {
2007-03-18 23:27:15 +00:00
free ( matcher - > root_regex_hostonly ) ;
2006-10-15 19:16:33 +00:00
free ( matcher - > root_regex ) ;
stack_destroy ( & matcher - > node_stack ) ;
return rc ;
}
2006-09-12 19:38:39 +00:00
matcher - > list_inited = 1 ;
2006-10-15 19:16:33 +00:00
matcher - > list_built = 1 ; /* its empty, but pretend its built, so that load_ will realloc root_hosts */
2006-09-12 19:38:39 +00:00
matcher - > list_loaded = 0 ;
return CL_SUCCESS ;
}
/* inserts @pattern into @root, using ac-matcher
* although the name might be confusing , @ pattern is not a regex ! */
static int add_regex_list_element ( struct cli_matcher * root , const char * pattern , char * info )
{
2006-10-10 23:51:49 +00:00
int ret ;
2006-09-12 19:38:39 +00:00
struct cli_ac_patt * new = cli_calloc ( 1 , sizeof ( * new ) ) ;
2006-10-10 23:51:49 +00:00
size_t len , i ;
2006-09-12 19:38:39 +00:00
if ( ! new )
return CL_EMEM ;
2006-10-10 23:51:49 +00:00
massert ( root ) ;
massert ( pattern ) ;
2006-09-12 19:38:39 +00:00
2007-06-16 16:36:22 +00:00
len = strlen ( pattern ) + 1 ;
/* need to match \0 too, so we are sure
* matches only happen at end of string */
2006-09-12 19:38:39 +00:00
new - > type = 0 ;
new - > sigid = 0 ;
new - > parts = 0 ;
new - > partno = 0 ;
new - > mindist = 0 ;
new - > maxdist = 0 ;
new - > offset = 0 ;
new - > target = 0 ;
new - > length = len ;
if ( new - > length > root - > maxpatlen )
root - > maxpatlen = new - > length ;
new - > pattern = cli_malloc ( sizeof ( new - > pattern [ 0 ] ) * len ) ;
if ( ! new - > pattern ) {
free ( new ) ;
return CL_EMEM ;
}
2006-09-16 15:49:27 +00:00
for ( i = 0 ; i < len ; i + + )
new - > pattern [ i ] = pattern [ i ] ; /*new->pattern is short int* */
2006-09-12 19:38:39 +00:00
2007-06-26 07:40:02 +00:00
2007-03-11 11:14:35 +00:00
new - > virname = cli_strdup ( info ) ;
2006-09-12 19:38:39 +00:00
if ( ( ret = cli_ac_addpatt ( root , new ) ) ) {
free ( new - > virname ) ;
free ( new - > pattern ) ;
free ( new ) ;
return ret ;
}
return CL_SUCCESS ;
}
2006-12-19 20:30:17 +00:00
static int functionality_level_check ( char * line )
2006-12-02 00:42:44 +00:00
{
char * ptmin ;
char * ptmax ;
size_t j ;
ptmin = strrchr ( line , ' : ' ) ;
if ( ! ptmin )
return CL_SUCCESS ;
ptmin + + ;
ptmax = strchr ( ptmin , ' - ' ) ;
if ( ! ptmax )
return CL_SUCCESS ; /* there is no functionality level specified, so we're ok */
else {
2006-12-19 20:30:17 +00:00
size_t min , max ;
2006-12-02 00:42:44 +00:00
ptmax + + ;
2006-12-19 20:30:17 +00:00
for ( j = 0 ; j + ptmin + 1 < ptmax ; j + + )
2006-12-02 00:42:44 +00:00
if ( ! isdigit ( ptmin [ j ] ) )
return CL_SUCCESS ; /* not numbers, not functionality level */
for ( j = 0 ; j < strlen ( ptmax ) ; j + + )
if ( ! isdigit ( ptmax [ j ] ) )
return CL_SUCCESS ; /* see above */
ptmax [ - 1 ] = ' \0 ' ;
min = atoi ( ptmin ) ;
if ( strlen ( ptmax ) = = 0 )
max = INT_MAX ;
else
max = atoi ( ptmax ) ;
if ( min > cl_retflevel ( ) ) {
2007-03-11 11:14:35 +00:00
cli_dbgmsg ( " regex list line %s not loaded (required f-level: %u) \n " , line , ( unsigned int ) min ) ;
2006-12-02 00:42:44 +00:00
return CL_EMALFDB ;
}
if ( max < cl_retflevel ( ) )
return CL_EMALFDB ;
ptmin [ - 1 ] = ' \0 ' ;
return CL_SUCCESS ;
}
}
2006-09-12 19:38:39 +00:00
/* Load patterns/regexes from file */
2006-10-10 23:51:49 +00:00
int load_regex_matcher ( struct regex_matcher * matcher , FILE * fd , unsigned int options , int is_whitelist )
2006-09-12 19:38:39 +00:00
{
int rc , line = 0 ;
char buffer [ FILEBUFF ] ;
2006-10-10 23:51:49 +00:00
massert ( matcher ) ;
massert ( fd ) ;
2006-09-12 19:38:39 +00:00
if ( matcher - > list_inited = = - 1 )
2006-10-10 23:51:49 +00:00
return CL_EMALFDB ; /* already failed to load */
2006-10-15 19:16:33 +00:00
/* if(matcher->list_loaded) {
2006-09-12 19:38:39 +00:00
cli_warnmsg ( " Regex list has already been loaded, ignoring further requests for load \n " ) ;
2006-10-10 23:51:49 +00:00
return CL_SUCCESS ;
2006-10-15 19:16:33 +00:00
} */
2006-09-12 19:38:39 +00:00
if ( ! fd ) {
cli_errmsg ( " Unable to load regex list (null file) \n " ) ;
2006-10-10 23:51:49 +00:00
return CL_EIO ;
2006-09-12 19:38:39 +00:00
}
cli_dbgmsg ( " Loading regex_list \n " ) ;
if ( ! matcher - > list_inited ) {
2006-10-10 23:51:49 +00:00
rc = init_regex_list ( matcher ) ;
2006-09-12 19:38:39 +00:00
if ( ! matcher - > list_inited ) {
cli_errmsg ( " Regex list failed to initialize! \n " ) ;
fatal_error ( matcher ) ;
2006-10-10 23:51:49 +00:00
return rc ;
2006-09-12 19:38:39 +00:00
}
/*atexit(regex_list_done); TODO: destroy this in manager.c */
}
/*
* Regexlist db format ( common to . wdb ( whitelist ) and . pdb ( domainlist ) files :
* Multiple lines of form , ( empty lines are skipped ) :
* Flags RealURL DisplayedURL
* Where :
2007-03-18 23:27:15 +00:00
* Flags :
*
* . pdb files :
* R - regex , H - host - only , followed by ( optional ) 3 - digit hexnumber representing
2006-09-12 19:38:39 +00:00
* flags that should be filtered .
* [ i . e . phishcheck urls . flags that we don ' t want to be done for this particular host ]
2007-03-18 23:27:15 +00:00
*
* . wdb files :
* X - full URL regex
* Y - host - only regex
* M - host simple pattern
2006-09-12 19:38:39 +00:00
*
* If a line in the file doesn ' t conform to this format , loading fails
*
*/
while ( fgets ( buffer , FILEBUFF , fd ) ) {
char * pattern ;
char * flags ;
cli_chomp ( buffer ) ;
if ( ! * buffer )
continue ; /* skip empty lines */
2006-12-02 00:42:44 +00:00
if ( functionality_level_check ( buffer ) )
continue ;
line + + ;
2006-10-28 15:34:40 +00:00
pattern = strchr ( buffer , ' : ' ) ;
2006-09-12 19:38:39 +00:00
if ( ! pattern ) {
cli_errmsg ( " Malformed regex list line %d \n " , line ) ;
fatal_error ( matcher ) ;
return CL_EMALFDB ;
}
2007-06-26 07:40:02 +00:00
/*pattern[0]='\0';*/
2006-10-10 23:51:49 +00:00
flags = buffer + 1 ;
2006-09-12 19:38:39 +00:00
pattern + + ;
2006-12-19 20:47:45 +00:00
if ( is_whitelist ) {
const size_t pattern_len = strlen ( pattern ) ;
if ( pattern_len < FILEBUFF ) {
pattern [ pattern_len ] = ' / ' ;
pattern [ pattern_len + 1 ] = ' \0 ' ;
}
else {
cli_errmsg ( " Overlong regex line %d \n " , line ) ;
fatal_error ( matcher ) ;
return CL_EMALFDB ;
}
}
2007-03-18 23:27:15 +00:00
if ( ( buffer [ 0 ] = = ' R ' & & ! is_whitelist ) | | ( ( buffer [ 0 ] = = ' X ' | | buffer [ 0 ] = = ' Y ' ) & & is_whitelist ) ) { /*regex*/
if ( ( rc = add_pattern ( matcher , ( const unsigned char * ) pattern , flags , buffer [ 0 ] = = ' Y ' ) ) )
2006-09-12 19:38:39 +00:00
return rc = = CL_EMEM ? CL_EMEM : CL_EMALFDB ;
}
2006-10-15 19:16:33 +00:00
else if ( ( buffer [ 0 ] = = ' H ' & & ! is_whitelist ) | | ( buffer [ 0 ] = = ' M ' & & is_whitelist ) ) { /*matches displayed host*/
2007-04-28 20:15:22 +00:00
struct cli_matcher * root ;
2006-10-15 19:16:33 +00:00
if ( matcher - > list_built ) {
struct cli_matcher * old_hosts = matcher - > root_hosts ;
matcher - > root_hosts_cnt + + ;
2007-05-26 10:22:18 +00:00
matcher - > root_hosts = cli_realloc ( matcher - > root_hosts , matcher - > root_hosts_cnt * sizeof ( * matcher - > root_hosts ) ) ;
2006-10-15 19:16:33 +00:00
if ( ! matcher - > root_hosts ) {
matcher - > root_hosts = old_hosts ; /* according to manpage this must still be valid*/
return CL_EMEM ;
2007-03-11 11:14:35 +00:00
}
2007-04-28 20:15:22 +00:00
root = & matcher - > root_hosts [ matcher - > root_hosts_cnt - 1 ] ;
memset ( root , 0 , sizeof ( struct cli_matcher ) ) ;
cli_dbgmsg ( " regex_list: Initialising AC pattern matcher \n " ) ;
if ( ( rc = cli_ac_init ( root , AC_DEFAULT_MIN_DEPTH , AC_DEFAULT_MAX_DEPTH ) ) ) {
/* no need to free previously allocated memory here */
cli_errmsg ( " regex_list: Can't initialise AC pattern matcher \n " ) ;
return rc ;
}
2006-10-15 19:16:33 +00:00
matcher - > list_built = 0 ;
}
2007-04-28 20:15:22 +00:00
else {
root = & matcher - > root_hosts [ matcher - > root_hosts_cnt - 1 ] ;
}
if ( ( rc = add_regex_list_element ( root , pattern , flags ) ) )
2006-09-12 19:38:39 +00:00
return rc = = CL_EMEM ? CL_EMEM : CL_EMALFDB ;
}
else {
2006-10-10 23:51:49 +00:00
return CL_EMALFDB ;
/* this is useless, we have host, and regex matches
2006-09-12 19:38:39 +00:00
if ( ( rc = add_regex_list_element ( matcher - > root_urls , pattern , flags ) ) )
2006-10-10 23:51:49 +00:00
return rc = = CL_EMEM ? CL_EMEM : CL_EMALFDB ; */
2006-09-12 19:38:39 +00:00
}
}
matcher - > list_loaded = 1 ;
2006-10-15 19:16:33 +00:00
if ( ( rc = build_regex_list ( matcher ) ) )
return rc ;
2006-09-12 19:38:39 +00:00
# ifndef NDEBUG
/* dump_tree(matcher->root_regex);*/
# endif
if ( ! matcher - > list_built ) {
cli_errmsg ( " Regex list not loaded: build failed! \n " ) ;
fatal_error ( matcher ) ;
return CL_EMALFDB ;
}
regex_list_cleanup ( matcher ) ;
return CL_SUCCESS ;
}
static struct tree_node * * tree_node_get_children ( const struct tree_node * node )
{
return node - > op = = OP_CUSTOMCLASS ? ( node - > u . children [ 1 ] ? node - > u . children + 1 : NULL ) : node - > u . children ;
}
2006-10-10 23:51:49 +00:00
2006-09-12 19:38:39 +00:00
/* Build the matcher list */
static int build_regex_list ( struct regex_matcher * matcher )
{
2006-10-15 19:16:33 +00:00
int rc ;
2006-09-12 19:38:39 +00:00
if ( ! matcher - > list_inited | | ! matcher - > list_loaded ) {
cli_errmsg ( " Regex list not loaded! \n " ) ;
return - 1 ; /*TODO: better error code */
}
cli_dbgmsg ( " Building regex list \n " ) ;
2006-10-30 17:53:03 +00:00
if ( matcher - > root_hosts )
if ( ( rc = cli_ac_buildtrie ( & matcher - > root_hosts [ matcher - > root_hosts_cnt - 1 ] ) ) )
2006-10-15 19:16:33 +00:00
return rc ;
2006-09-12 19:38:39 +00:00
matcher - > list_built = 1 ;
return CL_SUCCESS ;
}
/* Done with this matcher, free resources */
void regex_list_done ( struct regex_matcher * matcher )
{
2006-10-10 23:51:49 +00:00
massert ( matcher ) ;
2006-09-12 19:38:39 +00:00
regex_list_cleanup ( matcher ) ;
if ( matcher - > list_loaded ) {
2006-10-14 23:52:02 +00:00
if ( matcher - > root_hosts ) {
2006-10-15 19:16:33 +00:00
size_t i ;
2007-04-28 20:15:22 +00:00
for ( i = 0 ; i < matcher - > root_hosts_cnt ; i + + )
2006-10-15 19:16:33 +00:00
cli_ac_free ( & matcher - > root_hosts [ i ] ) ;
2006-10-14 23:52:02 +00:00
free ( matcher - > root_hosts ) ;
matcher - > root_hosts = NULL ;
}
2006-09-12 19:38:39 +00:00
2006-10-15 19:16:33 +00:00
matcher - > root_hosts_cnt = 0 ;
2006-09-12 19:38:39 +00:00
matcher - > list_built = 0 ;
destroy_tree ( matcher ) ;
matcher - > list_loaded = 0 ;
}
if ( matcher - > list_inited ) {
matcher - > list_inited = 0 ;
}
stack_destroy ( & matcher - > node_stack ) ;
stack_destroy ( & matcher - > node_stack_alt ) ;
}
/* Tree matcher algorithm */
struct token_t
{
union {
2006-10-10 23:51:49 +00:00
const unsigned char * start ;
char_bitmap_p bitmap ;
unsigned char c ;
2006-09-12 19:38:39 +00:00
} u ;
2006-10-10 23:51:49 +00:00
size_t len ;
char type ;
2006-09-12 19:38:39 +00:00
} ;
enum { TOKEN_CHAR , TOKEN_DOT , TOKEN_PAR_OPEN , TOKEN_PAR_CLOSE , TOKEN_BRACKET , TOKEN_ALT , TOKEN_REGEX , TOKEN_DONE } ;
static const unsigned char * getNextToken ( const unsigned char * pat , struct token_t * token )
{
2006-10-10 23:51:49 +00:00
massert ( pat ) ;
massert ( token ) ;
2006-09-12 19:38:39 +00:00
switch ( * pat ) {
case ' \\ ' :
token - > type = TOKEN_CHAR ;
2006-10-10 23:51:49 +00:00
token - > u . c = * ( + + pat ) ;
if ( islower ( token - > u . c ) ) {
2006-09-12 19:38:39 +00:00
/* handle \n, \t, etc. */
2006-09-25 18:29:02 +00:00
char fmt [ 3 ] = { ' \\ ' , ' \0 ' , ' \0 ' } ;
2006-09-12 19:38:39 +00:00
char c ;
2006-09-25 18:29:02 +00:00
2006-10-10 23:51:49 +00:00
fmt [ 1 ] = token - > u . c ;
if ( snprintf ( & c , 1 , fmt ) ! = 1 ) {
2006-09-12 19:38:39 +00:00
token - > type = TOKEN_REGEX ;
2006-10-10 23:51:49 +00:00
token - > u . start = pat ;
}
2006-09-24 19:28:03 +00:00
else
2006-10-10 23:51:49 +00:00
token - > u . c = c ;
2006-09-12 19:38:39 +00:00
}
2006-09-25 18:29:02 +00:00
token - > len = 1 ;
2006-09-12 19:38:39 +00:00
break ;
case ' | ' :
token - > type = TOKEN_ALT ;
break ;
case ' * ' :
case ' + ' :
case ' ? ' :
case ' { ' :
case ' } ' :
token - > type = TOKEN_REGEX ;
break ;
case ' [ ' :
{
/*TODO: implement*/
/*see if it is something simple like a list of characters, a range, or negated ...*/
const unsigned char * old = pat + + ; /* save this in case we change our mind and decide this is too complicated for us to handle*/
unsigned char range_start = 0 ;
int hasprev = 0 ;
char_bitmap_p bitmap = cli_malloc ( 32 ) ;
if ( ! bitmap )
return NULL ;
if ( * pat = = ' ^ ' ) {
memset ( bitmap , 0xFF , 32 ) ; /*match chars not in brackets*/
pat + + ;
}
else
memset ( bitmap , 0x00 , 32 ) ;
do {
/* literal ] can be first character, so test for it at the end of the loop, for example: []] */
if ( * pat = = ' - ' & & hasprev ) {
/* it is a range*/
unsigned char range_end ;
unsigned int c ;
2006-10-10 23:51:49 +00:00
massert ( range_start ) ;
2006-09-12 19:38:39 +00:00
pat + + ;
if ( pat [ 0 ] = = ' [ ' )
if ( pat [ 1 ] = = ' . ' ) {
if ( pat [ 2 ] = = ' - ' & & pat [ 3 ] = = ' . ' & & pat [ 4 ] = = ' ] ' )
range_end = ' - ' ;
else {
/* this is getting complicated, bail out */
cli_warnmsg ( " confused about collating sequences in regex,bailing out " ) ;
pat = old ;
token - > type = TOKEN_REGEX ;
break ;
}
}
else
range_end = * pat ;
else
range_end = * pat ;
for ( c = range_start + 1 ; c < = range_end ; c + + )
bitmap [ c > > 3 ] ^ = 1 < < ( c & 0x7 ) ;
hasprev = 0 ;
}
else if ( pat [ 0 ] = = ' [ ' & & pat [ 1 ] = = ' : ' ) {
const unsigned char * end ;
int len , found = - 1 ;
size_t i ;
pat + = 2 ;
end = ( unsigned char * ) strstr ( ( const char * ) pat , " :] " ) ;
if ( ! end ) {
cli_warnmsg ( " confused about std char class syntax regex,bailing out " ) ;
pat = old ;
token - > type = TOKEN_REGEX ;
break ;
}
len = end - pat ;
for ( i = 0 ; i < std_class_cnt ; i + + )
2006-10-10 23:51:49 +00:00
if ( ! strncmp ( ( const char * ) pat , std_class [ i ] , len ) ) {
2006-09-12 19:38:39 +00:00
found = i ;
break ;
}
if ( found ! = - 1 ) {
for ( i = 0 ; i < 256 ; i + + )
if ( char_class [ i ] & ( 1 < < found ) )
bitmap [ i > > 3 ] ^ = 1 < < ( i & 0x7 ) ;
}
else {
/*unknown class*/
cli_warnmsg ( " confused about regex bracket expression, bailing out " ) ;
pat = old ;
token - > type = TOKEN_REGEX ;
break ;
}
}
else {
bitmap [ * pat > > 3 ] ^ = 1 < < ( * pat & 0x7 ) ;
pat + + ;
range_start = * pat ;
hasprev = 1 ;
}
} while ( * pat ! = ' ] ' ) ;
/*TODO: see if this bitmap already exists, then reuse*/
token - > type = TOKEN_BRACKET ;
token - > u . bitmap = bitmap ;
break ;
}
case ' ] ' :
2006-10-10 23:51:49 +00:00
massert ( 0 & & " Encountered ] without matching [ " ) ;
2006-09-12 19:38:39 +00:00
/* bad state */
break ;
case ' . ' :
token - > type = TOKEN_DOT ;
break ;
case ' ( ' :
token - > type = TOKEN_PAR_OPEN ;
break ;
case ' ) ' :
token - > type = TOKEN_PAR_CLOSE ;
break ;
default :
token - > type = TOKEN_CHAR ;
2006-10-10 23:51:49 +00:00
token - > u . c = * pat ;
2006-09-12 19:38:39 +00:00
token - > len = 1 ;
break ;
}
return + + pat ;
}
# define INITIAL_ALT_STACK 10
# define ALT_STACK_GROW 20
static const unsigned char * find_regex_start ( const unsigned char * pat )
{
struct token_t token ;
/*TODO: find where the regex part begins, for ex:
* abcd + , regex begins at ' d '
* */
const unsigned char * last = NULL ;
const unsigned char * tmp = NULL ;
const unsigned char * * altpositions = cli_malloc ( INITIAL_ALT_STACK * sizeof ( * altpositions ) ) ;
size_t altpositions_capacity = INITIAL_ALT_STACK ;
size_t altpositions_cnt = 0 ;
char lasttype = - 1 ;
if ( ! altpositions )
return NULL ;
2006-10-10 23:51:49 +00:00
massert ( pat ) ;
2006-09-12 19:38:39 +00:00
/* Try to parse pattern till special regex chars are encountered, that the tree-matcher doesn't handle, like: +,*,{}.
* The tricky part is that once we encounter these , the previous ' atom ' has to be passed on to the regex matcher , so we have to
* back up to the last known good position
* Example , if we have : abc ( defg ) + , then only abc can be handled by tree parser , so we have to return the position of ( .
* Another example : abc ( defg | xyz | oz + | pdo ) , the last known good position is | , after xyz
* TODO : what about open parantheses ? maybe once we found a special char , we have top back out before the first ( ?
* */
do {
tmp = pat ;
pat = getNextToken ( pat , & token ) ;
if ( token . type ! = TOKEN_REGEX ) {
last = tmp ;
lasttype = token . type ;
2006-10-14 23:52:02 +00:00
if ( token . type = = TOKEN_BRACKET & & token . u . bitmap )
2006-09-12 19:38:39 +00:00
free ( token . u . bitmap ) ;
if ( token . type = = TOKEN_ALT | | token . type = = TOKEN_PAR_OPEN ) {
/* save this position on stack, succesfully parsed till here*/
if ( altpositions_cnt & & altpositions [ altpositions_cnt - 1 ] [ 0 ] = = ' | ' )
/* encountered another alternate (|) operator, override previous | position stored */
altpositions [ altpositions_cnt - 1 ] = last ;
else {
altpositions [ altpositions_cnt + + ] = last ;
if ( altpositions_cnt = = altpositions_capacity ) {
altpositions_capacity + = ALT_STACK_GROW ;
2007-05-25 23:10:58 +00:00
altpositions = cli_realloc2 ( altpositions , altpositions_capacity * sizeof ( * altpositions ) ) ;
2006-09-12 19:38:39 +00:00
if ( ! altpositions )
return NULL ;
}
}
} else if ( lasttype = = TOKEN_PAR_CLOSE ) {
/* remove last stored position from stack, succesfully this last group */
altpositions_cnt - - ;
2006-10-10 23:51:49 +00:00
massert ( altpositions_cnt > 0 ) ;
2006-09-12 19:38:39 +00:00
}
}
else {
if ( altpositions_cnt )
last = altpositions [ 0 /*altpositions_cnt-1*/ ] ; /*TODO: which index here?, see above TODO... */
/*last stored 'safe' position where no special (+,*,{}) regex chars were encountered*/
}
} while ( * pat & & token . type ! = TOKEN_REGEX ) ;
free ( altpositions ) ;
return * pat ? last : last + 1 ;
}
static struct tree_node * tree_node_alloc ( struct tree_node * next , char listend )
{
struct tree_node * node = cli_malloc ( sizeof ( * node ) ) ;
if ( node ) {
node - > alternatives = 0 ;
node - > next = next ;
node - > listend = listend ;
node - > u . children = NULL ;
}
return node ;
}
static struct tree_node * tree_root_alloc ( void )
{
struct tree_node * root = tree_node_alloc ( NULL , 1 ) ;
if ( root ) {
root - > op = OP_ROOT ;
root - > c = 0 ;
root - > next = NULL ;
root - > listend = 1 ;
}
return root ;
}
2006-10-10 23:51:49 +00:00
2007-03-11 11:14:35 +00:00
static struct tree_node * tree_node_char_binsearch ( const struct tree_node * node , const char csearch , int * left )
2006-09-12 19:38:39 +00:00
{
int right ;
struct tree_node * * children ;
2006-10-10 23:51:49 +00:00
massert ( node ) ;
massert ( left ) ;
2006-09-12 19:38:39 +00:00
children = tree_node_get_children ( node ) ;
right = node - > alternatives - 1 ;
* left = 0 ;
if ( ! node - > alternatives )
return NULL ;
2006-10-10 23:51:49 +00:00
massert ( children ) ;
2006-09-12 19:38:39 +00:00
while ( * left < = right ) {
int mid = * left + ( right - * left ) / 2 ;
if ( children [ mid ] - > c = = csearch )
return children [ mid ] ;
else if ( children [ mid ] - > c < csearch )
* left = mid + 1 ;
else
right = mid - 1 ;
}
return NULL ;
}
2007-03-11 11:14:35 +00:00
static struct tree_node * tree_get_next ( struct tree_node * node )
2006-09-12 19:38:39 +00:00
{
struct tree_node * * children ;
2006-10-10 23:51:49 +00:00
massert ( node ) ;
2006-09-12 19:38:39 +00:00
children = tree_node_get_children ( node ) ;
if ( ! node - > alternatives & & children & & children [ 0 ] )
return children [ 0 ] ;
else if ( node - > alternatives < = 1 )
return node ;
else
return children [ 0 ] - > next ;
}
2007-03-11 11:14:35 +00:00
static size_t tree_node_get_array_size ( const struct tree_node * node )
2006-09-12 19:38:39 +00:00
{
2006-10-10 23:51:49 +00:00
massert ( node ) ;
2006-09-12 19:38:39 +00:00
/* if op is CUSTOMCLASS, then first pointer is pointer to bitmap, so array size is +1 */
return ( node - > alternatives + ( node - > op = = OP_CUSTOMCLASS ? 1 : 0 ) ) * sizeof ( node - > u . children [ 0 ] ) ;
}
2007-03-11 11:14:35 +00:00
static struct tree_node * tree_node_char_insert ( struct tree_node * node , const char c , int left )
2006-09-12 19:38:39 +00:00
{
struct tree_node * new , * alt = tree_get_next ( node ) ;
2006-10-10 23:51:49 +00:00
struct tree_node * * children ;
2006-09-12 19:38:39 +00:00
node - > alternatives + + ;
2007-05-25 23:10:58 +00:00
node - > u . children = cli_realloc2 ( node - > u . children , tree_node_get_array_size ( node ) ) ;
2006-09-12 19:38:39 +00:00
if ( ! node - > u . children )
return NULL ;
2006-10-10 23:51:49 +00:00
children = node - > op = = OP_CUSTOMCLASS ? node - > u . children + 1 : node - > u . children ;
2006-09-12 19:38:39 +00:00
new = tree_node_alloc ( alt , node = = alt ) ;
if ( new ) {
new - > op = OP_CHAR ;
new - > c = c ;
}
if ( node - > alternatives - left - 1 > 0 )
2006-10-10 23:51:49 +00:00
memmove ( & children [ left + 1 ] , & children [ left ] , ( node - > alternatives - left - 1 ) * sizeof ( node - > u . children [ 0 ] ) ) ;
children [ left ] = new ;
2006-09-12 19:38:39 +00:00
return new ;
}
2007-03-11 11:14:35 +00:00
static void tree_node_insert_nonbin ( struct tree_node * node , struct tree_node * new )
2006-09-12 19:38:39 +00:00
{
struct tree_node * * children ;
2006-10-10 23:51:49 +00:00
massert ( node ) ;
massert ( new ) ;
2006-09-12 19:38:39 +00:00
children = tree_node_get_children ( node ) ;
if ( node - > alternatives ) {
2006-10-10 23:51:49 +00:00
massert ( children ) ;
2006-09-12 19:38:39 +00:00
if ( children [ 0 ] - > next = = node ) {
int i ;
new - > listend = 1 ;
for ( i = 0 ; i < node - > alternatives ; i + + ) {
children [ i ] - > next = new ;
children [ i ] - > listend = 0 ;
}
}
else {
struct tree_node * p ;
for ( p = children [ 0 ] - > next ; p - > next ! = node ; p = p - > next )
2006-10-10 23:51:49 +00:00
massert ( ! p - > listend ) ;
2006-09-12 19:38:39 +00:00
new - > listend = 1 ;
p - > listend = 0 ;
p - > next = new ;
}
}
else {
2006-09-24 19:28:03 +00:00
int idx = node - > op = = OP_CUSTOMCLASS ? 1 : 0 ;
2006-09-12 19:38:39 +00:00
if ( node - > u . children )
2006-09-24 19:28:03 +00:00
if ( node - > u . children [ idx ] ) {
node = node - > u . children [ idx ] ;
while ( node - > next & & ! node - > listend )
node = node - > next ;
node - > listend = 0 ;
2007-06-02 15:18:31 +00:00
if ( new - > next = = node ) {
new - > next = node - > next ;
}
2006-09-24 19:28:03 +00:00
node - > next = new ;
new - > listend = 1 ;
2007-02-17 19:16:19 +00:00
return ;
2006-09-24 19:28:03 +00:00
}
2007-05-25 23:10:58 +00:00
node - > u . children = cli_realloc2 ( node - > u . children , sizeof ( node - > u . children [ 0 ] ) * ( 2 ) ) ;
2006-09-24 19:28:03 +00:00
if ( node - > u . children ) {
node - > u . children [ idx ] = new ;
}
2006-09-12 19:38:39 +00:00
}
}
2007-03-11 11:14:35 +00:00
static unsigned char char_getclass ( const unsigned char * bitmap )
2006-09-12 19:38:39 +00:00
{
size_t i ;
2006-10-10 23:51:49 +00:00
massert ( bitmap ) ;
2006-09-12 19:38:39 +00:00
for ( i = 0 ; i < std_class_cnt ; i + + )
if ( ! memcmp ( bitmap , char_class_bitmap [ i ] , 256 > > 3 ) )
return i ;
return std_class_cnt ;
}
static void stack_destroy ( struct node_stack * stack )
{
2006-10-10 23:51:49 +00:00
massert ( stack ) ;
2006-09-12 19:38:39 +00:00
if ( stack - > data )
free ( stack - > data ) ;
stack - > data = NULL ;
stack - > capacity = 0 ;
}
/* call this after whitelist load is complete, and the tree is no longer going to be modified */
void regex_list_cleanup ( struct regex_matcher * matcher )
{
2006-10-10 23:51:49 +00:00
massert ( matcher ) ;
2006-09-12 19:38:39 +00:00
stack_destroy ( & matcher - > node_stack ) ;
stack_destroy ( & matcher - > node_stack_alt ) ;
stack_init ( & matcher - > node_stack ) ;
stack_init ( & matcher - > node_stack_alt ) ;
}
int is_regex_ok ( struct regex_matcher * matcher )
{
2006-10-10 23:51:49 +00:00
massert ( matcher ) ;
2006-09-12 19:38:39 +00:00
return ( ! matcher - > list_inited | | matcher - > list_inited ! = - 1 ) ; /* either we don't have a regexlist, or we initialized it successfully */
}
/* returns 0 on success, regexec error code otherwise */
2007-03-18 23:27:15 +00:00
static int add_pattern ( struct regex_matcher * matcher , const unsigned char * pat , const char * info , int hostonly )
2006-09-12 19:38:39 +00:00
{
int bol = 1 ;
const unsigned char * pat_end = find_regex_start ( pat ) ;
struct token_t token ;
struct tree_node * node ;
2006-10-10 23:51:49 +00:00
massert ( matcher ) ;
2006-09-12 19:38:39 +00:00
2007-03-18 23:27:15 +00:00
node = hostonly ? matcher - > root_regex_hostonly : matcher - > root_regex ;
2006-09-12 19:38:39 +00:00
stack_reset ( & matcher - > node_stack ) ;
stack_reset ( & matcher - > node_stack_alt ) ;
stack_push ( & matcher - > node_stack , node ) ;
for ( ; node - > op ! = OP_LEAF ; ) {
if ( pat < pat_end )
pat = getNextToken ( pat , & token ) ;
else if ( * pat ) {
token . type = TOKEN_REGEX ;
token . u . start = pat ;
}
else
token . type = TOKEN_DONE ;
switch ( token . type ) {
case TOKEN_CHAR :
{
/* search for char in tree */
int left ;
2006-10-10 23:51:49 +00:00
struct tree_node * newnode = tree_node_char_binsearch ( node , token . u . c , & left ) ;
2006-09-12 19:38:39 +00:00
if ( newnode )
node = newnode ;
else {
/* not found, insert it */
2006-10-10 23:51:49 +00:00
node = tree_node_char_insert ( node , token . u . c , left ) ;
2006-09-12 19:38:39 +00:00
}
break ;
}
case TOKEN_PAR_OPEN :
stack_push ( & matcher - > node_stack_alt , NULL ) ; /* marker */
stack_push ( & matcher - > node_stack , node ) ;
break ;
case TOKEN_PAR_CLOSE : {
/*TODO: test this!!!*/
struct tree_node * node_alt = node ;
node = tree_node_alloc ( NULL , 1 ) ;
node - > op = OP_PARCLOSE ;
node - > c = 0 ;
node - > listend = 1 ;
tree_node_insert_nonbin ( node_alt , node ) ;
while ( ( node_alt = stack_pop ( & matcher - > node_stack_alt ) ) ) {
tree_node_insert_nonbin ( node_alt , node ) ;
}
stack_pop ( & matcher - > node_stack ) ;
break ;
}
case TOKEN_ALT :
stack_push ( & matcher - > node_stack_alt , node ) ;
node = stack_pop ( & matcher - > node_stack ) ;
stack_push ( & matcher - > node_stack , node ) ;
break ;
case TOKEN_BRACKET :
{
struct tree_node * new = tree_node_alloc ( tree_get_next ( node ) , 1 ) ;
2006-10-10 23:51:49 +00:00
unsigned char charclass = char_getclass ( token . u . bitmap ) ;
2006-09-12 19:38:39 +00:00
if ( charclass = = std_class_cnt ) { /*not a std char class*/
new - > op = OP_CUSTOMCLASS ;
new - > u . children = cli_malloc ( sizeof ( new - > u . children [ 0 ] ) * 2 ) ;
2006-10-14 23:52:02 +00:00
if ( ! new - > u . children )
return CL_EMEM ;
2006-09-12 19:38:39 +00:00
new - > u . bitmap [ 0 ] = token . u . bitmap ;
new - > u . bitmap [ 1 ] = NULL ;
tree_node_insert_nonbin ( node , new ) ;
node = new ;
}
else {
new - > op = OP_STDCLASS ;
new - > c = charclass ;
tree_node_insert_nonbin ( node , new ) ;
node = new ;
}
break ;
}
case TOKEN_DOT :
{
struct tree_node * new = tree_node_alloc ( tree_get_next ( node ) , 1 ) ;
new - > op = OP_DOT ;
tree_node_insert_nonbin ( node , new ) ;
node = new ;
break ;
}
case TOKEN_REGEX :
case TOKEN_DONE : {
struct leaf_info * leaf = cli_malloc ( sizeof ( * leaf ) ) ;
2006-10-14 23:52:02 +00:00
if ( ! leaf )
return CL_EMEM ;
2007-03-11 11:14:35 +00:00
leaf - > info = cli_strdup ( info ) ;
2006-09-12 19:38:39 +00:00
if ( token . type = = TOKEN_REGEX ) {
int rc ;
struct tree_node * new ;
regex_t * preg ;
preg = cli_malloc ( sizeof ( * preg ) ) ;
2006-10-14 23:52:02 +00:00
if ( ! preg )
return CL_EMEM ;
2006-09-24 19:28:03 +00:00
rc = regcomp ( preg , ( const char * ) token . u . start , REG_EXTENDED | ( bol ? 0 : REG_NOTBOL ) ) ;
2006-09-12 19:38:39 +00:00
leaf - > preg = preg ;
if ( rc )
return rc ;
new = cli_malloc ( sizeof ( * new ) ) ;
2006-10-14 23:52:02 +00:00
if ( ! new )
return CL_EMEM ;
2006-09-12 19:38:39 +00:00
new - > op = OP_LEAF ;
new - > next = node ;
new - > alternatives = 0 ;
new - > u . leaf = leaf ;
new - > listend = 1 ;
tree_node_insert_nonbin ( node , new ) ;
}
else {
leaf - > preg = NULL ;
node - > alternatives = 0 ;
node - > u . leaf = leaf ;
node - > op = OP_LEAF ;
}
return 0 ;
}
}
bol = 0 ;
}
return 0 ;
}
/* c has to be unsigned char here!! */
static int match_node ( struct tree_node * node , const unsigned char * c , size_t len , const char * * info )
{
struct tree_node * * children ;
int rc ;
2006-10-10 23:51:49 +00:00
massert ( node ) ;
massert ( c ) ;
massert ( info ) ;
2006-09-12 19:38:39 +00:00
2006-09-16 15:49:27 +00:00
if ( ! node - > u . children )
return MATCH_FAILED ; /* tree empty */
2006-09-12 19:38:39 +00:00
* info = NULL ;
len + + ;
c - - ;
for ( ; ; ) {
2006-10-10 23:51:49 +00:00
massert ( node ) ;
2006-09-12 19:38:39 +00:00
children = node - > u . children ;
switch ( node - > op ) {
case OP_ROOT :
rc = 1 ;
break ;
case OP_PARCLOSE :
/*this isn't a real character, so don't move*/
c - - ;
len + + ;
rc = 1 ;
break ;
case OP_CHAR :
2006-10-10 23:51:49 +00:00
massert ( * c = = node - > c & & " We know this has to match " ) ;
2006-09-12 19:38:39 +00:00
rc = 1 ; /* *c==node->c;- we know it has matched */
break ;
case OP_DOT :
rc = * c ! = ' \n ' ;
break ;
case OP_STDCLASS :
rc = char_class [ * c ] & ( node - > c ) ;
break ;
case OP_CUSTOMCLASS :
{
char_bitmap_p bitmap ;
2006-10-10 23:51:49 +00:00
massert ( children ) ;
2006-09-12 19:38:39 +00:00
bitmap = ( char_bitmap_p ) node - > u . bitmap [ 0 ] ;
children + + ;
rc = bitmap [ * c > > 3 ] & ( 1 < < ( * c & 0x7 ) ) ;
break ;
}
case OP_LEAF :
{
const struct leaf_info * leaf = node - > u . leaf ;
/*isleaf = 1;*/
if ( leaf - > preg ) {
rc = ! regexec ( leaf - > preg , ( const char * ) c , 0 , NULL , 0 ) ;
}
else {
2006-10-10 23:51:49 +00:00
massert ( * c = = node - > c & & " We know this has to match[2] " ) ;
2006-09-12 19:38:39 +00:00
rc = 1 ;
}
if ( rc ) {
* info = leaf - > info ;
return MATCH_SUCCESS ;
}
break ;
}
default :
/* impossible */
cli_errmsg ( " Encountered invalid operator in tree:%d \n " , node - > op ) ;
exit ( 1 ) ;
}
len - - ;
if ( ! len ) rc = 0 ;
c + + ;
if ( rc ) {
const char csearch = * c ;
int left = 0 , right = node - > alternatives - 1 ;
int mid ;
/*matched so far, go deeper*/
/*do a binary search between children */
2006-10-10 23:51:49 +00:00
massert ( children ) ;
2006-09-12 19:38:39 +00:00
while ( left < = right ) {
mid = left + ( right - left ) / 2 ;
if ( children [ mid ] - > c = = csearch )
break ;
else if ( children [ mid ] - > c < csearch )
left = mid + 1 ;
else
right = mid - 1 ;
}
if ( left < = right ) {
node = children [ mid ] ;
2006-10-10 23:51:49 +00:00
massert ( node ) ;
2006-09-12 19:38:39 +00:00
}
else {
if ( node - > alternatives ) {
if ( ! children [ 0 ] - > listend ) {
node = children [ 0 ] ;
c + + ;
len - - ;
}
while ( node & & node - > listend ) {
node = node - > next ; /* climb up */
c - - ;
len + + ;
}
if ( ! node | | ! node - > next )
return MATCH_FAILED ; /* reached root node */
node = node - > next ;
c - - ;
len + + ;
}
else if ( node - > u . children ) {
struct tree_node * rewrite_next = NULL ;
if ( node - > op = = OP_PARCLOSE )
rewrite_next = node ;
node = children [ 0 ] ;
2006-10-10 23:51:49 +00:00
massert ( node ) ;
massert ( node - > op ! = OP_CHAR ) ;
2006-09-12 19:38:39 +00:00
if ( rewrite_next )
node - > next = rewrite_next ; /* this node is pointed to by several parent nodes,
we need to know
from which one we came , so we can find out way back
should we fail to match somewhere deeper */
}
}
}
else {
/* this node didn't match, try sibling, or parent (if no more siblings) */
while ( node & & node - > listend ) {
node = node - > next ; /* sibling of parent */
c - - ;
len + + ;
}
if ( ! node | | ! node - > next ) /* reached root node, it has no next */
return MATCH_FAILED ;
2006-09-24 19:28:03 +00:00
else {
c - - ;
len + + ;
node = node - > next ;
}
2006-09-12 19:38:39 +00:00
}
}
return MATCH_FAILED ;
}
/* push node on stack, only if it isn't there already */
2007-03-11 11:14:35 +00:00
static void stack_push_once ( struct node_stack * stack , struct tree_node * node )
2006-09-12 19:38:39 +00:00
{
size_t i ;
2006-10-10 23:51:49 +00:00
massert ( stack ) ;
massert ( node ) ;
2006-09-12 19:38:39 +00:00
for ( i = 0 ; i < stack - > cnt ; i + + )
if ( stack - > data [ i ] = = node )
return ;
stack_push ( stack , node ) ;
}
static void destroy_tree_internal ( struct regex_matcher * matcher , struct tree_node * node )
{
struct tree_node * * children ;
2006-10-10 23:51:49 +00:00
massert ( matcher ) ;
massert ( node ) ;
2006-09-12 19:38:39 +00:00
children = tree_node_get_children ( node ) ;
if ( node - > op = = OP_LEAF ) {
struct leaf_info * leaf = node - > u . leaf ;
if ( node - > next & & ! node - > listend )
destroy_tree_internal ( matcher , node - > next ) ;
stack_push_once ( & matcher - > node_stack , ( struct tree_node * ) node - > u . leaf ) ; /* cast to make compiler happy, and to not make another stack implementation for storing void* */
stack_push_once ( & matcher - > node_stack , node ) ;
if ( leaf - > preg ) {
regfree ( leaf - > preg ) ;
free ( leaf - > preg ) ;
leaf - > preg = NULL ;
}
if ( leaf - > info ) {
free ( leaf - > info ) ;
leaf - > info = NULL ;
}
/* return;*/
}
if ( node - > alternatives ) {
int i ;
struct tree_node * p ;
2006-10-10 23:51:49 +00:00
massert ( children ) ;
2006-09-12 19:38:39 +00:00
p = children [ 0 ] - > op = = OP_LEAF ? NULL : children [ 0 ] - > next ;
for ( i = 0 ; i < node - > alternatives ; i + + )
destroy_tree_internal ( matcher , children [ i ] ) ;
if ( p & & p ! = node )
destroy_tree_internal ( matcher , p ) ; /*?? is this ok, or without _internal?*/
}
else {
if ( children ) {
if ( children [ 0 ] )
destroy_tree_internal ( matcher , children [ 0 ] ) ;
}
}
2006-09-24 19:28:03 +00:00
if ( node - > op ! = OP_LEAF & & node - > next & & ! node - > listend )
2006-09-12 19:38:39 +00:00
destroy_tree_internal ( matcher , node - > next ) ;
if ( node - > u . children )
stack_push_once ( & matcher - > node_stack , ( struct tree_node * ) node - > u . children ) ; /* cast to make compiler happy, it isn't really a tree_node* */
if ( node - > op = = OP_CUSTOMCLASS & & node - > u . children [ 0 ] ) {
free ( node - > u . children [ 0 ] ) ;
node - > u . children [ 0 ] = NULL ;
}
stack_push_once ( & matcher - > node_stack , node ) ;
}
static void destroy_tree ( struct regex_matcher * matcher )
{
/* we might have the same node linked by different nodes, so a recursive walk&free doesn't work in all situations,
* i . e . it might double - free , so instead of freeing , just push the nodes on a stack , and later free the nodes in that stack ,
* ( and push to stack only if it doesn ' t contain it already */
2006-10-10 23:51:49 +00:00
massert ( matcher ) ;
2006-09-12 19:38:39 +00:00
stack_reset ( & matcher - > node_stack ) ;
destroy_tree_internal ( matcher , matcher - > root_regex ) ;
2007-03-18 23:27:15 +00:00
destroy_tree_internal ( matcher , matcher - > root_regex_hostonly ) ;
2006-09-12 19:38:39 +00:00
while ( matcher - > node_stack . cnt ) {
struct tree_node * node = stack_pop ( & matcher - > node_stack ) ;
2006-10-14 23:52:02 +00:00
if ( node )
free ( node ) ;
2006-09-12 19:38:39 +00:00
}
}
# ifndef NDEBUG
static void dump_node ( struct tree_node * node )
{
int i ;
struct tree_node * p , * * children ;
2006-10-10 23:51:49 +00:00
massert ( node ) ;
2006-09-12 19:38:39 +00:00
if ( node - > op = = OP_LEAF ) {
if ( node - > u . leaf - > preg )
printf ( " n%p [label= \" regex \\ nleaf \" ] " , ( void * ) node ) ;
else
printf ( " n%p [label= \" %c \\ nleaf \" ]; \n " , ( void * ) node , node - > c ) ;
if ( node - > next & & ! node - > listend ) {
printf ( " n%p -> n%p; \n " , ( void * ) node , ( void * ) node - > next ) ;
dump_node ( node - > next ) ;
}
return ;
}
printf ( " n%p [label= \" %c \\ n%d \\ nlistend:%d \" ]; \n " , ( void * ) node , ( node - > op = = OP_ROOT | | node - > op = = OP_PARCLOSE ) ? ' @ ' : node - > c , node - > op , node - > listend ) ;
if ( node - > next )
printf ( " n%p -> n%p; \n " , ( void * ) node , ( void * ) node - > next ) ;
printf ( " n%p -> { " , ( void * ) node ) ; /*using address of node as id*/
children = tree_node_get_children ( node ) ;
if ( node - > alternatives )
2006-10-10 23:51:49 +00:00
massert ( children ) ;
2006-09-12 19:38:39 +00:00
for ( i = 0 ; i < node - > alternatives ; i + + )
printf ( " n%p " , ( void * ) children [ i ] ) ;
if ( node - > alternatives & & children [ 0 ] - > op ! = OP_LEAF )
for ( p = children [ 0 ] - > next ; p ! = node ; p = p - > next )
{
2006-10-10 23:51:49 +00:00
massert ( p ) ;
2006-09-12 19:38:39 +00:00
printf ( " n%p " , ( void * ) p ) ;
if ( p - > op = = OP_LEAF | | p - > listend )
break ;
}
if ( ! node - > alternatives & & children & & children [ 0 ] )
printf ( " n%p " , ( void * ) children [ 0 ] ) ;
printf ( " }; \n " ) ;
printf ( " {rank=same; " ) ;
for ( i = 0 ; i < node - > alternatives ; i + + )
printf ( " n%p " , ( void * ) node - > u . children [ i ] ) ;
if ( node - > alternatives & & children [ 0 ] - > op ! = OP_LEAF )
for ( p = children [ 0 ] - > next ; p ! = node ; p = p - > next )
{
printf ( " n%p " , ( void * ) p ) ;
if ( p - > op = = OP_LEAF | | p - > listend )
break ;
}
if ( ! node - > alternatives & & children & & children [ 0 ] )
printf ( " n%p " , ( void * ) children [ 0 ] ) ;
printf ( " }; \n " ) ;
for ( i = 0 ; i < node - > alternatives ; i + + )
dump_node ( children [ i ] ) ;
if ( node - > alternatives & & children [ 0 ] - > op ! = OP_LEAF )
for ( p = children [ 0 ] - > next ; p ! = node ; p = p - > next )
{
dump_node ( p ) ;
if ( p - > op = = OP_LEAF | | p - > listend )
break ;
}
if ( ! node - > alternatives & & children & & children [ 0 ] )
dump_node ( children [ 0 ] ) ;
}
void dump_tree ( struct tree_node * root )
{
/*use dot/dotty from graphviz to view it*/
2006-10-10 23:51:49 +00:00
massert ( root ) ;
2006-09-12 19:38:39 +00:00
printf ( " digraph tree { \n " ) ;
dump_node ( root ) ;
printf ( " } \n " ) ;
}
# endif
2007-06-09 16:09:45 +00:00
# else
/*------------------------New version of regex_list.c------------------------*/
/* Regex_list.c:
* A scalable , trie - based implementation for matching against
* a list of regular expressions .
*
* A trivial way to implement matching against a list of regular expressions
* would have been to construct a single regular expression , by concatenating
* the list with the alternate ( | ) operator .
* BUT a usual DFA implementation of regular expression matching ( eg . : GNU libc )
* leads to " state explosion " when there are many ( 5000 + ) alternate ( | ) operators .
* This is the reason for using a trie - based implementation .
*
*
* Design considerations :
*
* Recursive call points : there are situations when match has to be retried on a different sub - trie , or with a different repeat count .
* Alternate operators , and repeat / range operators ( + , * , { } ) are recursiv call points . When a failure is encountered during a match ,
* the function simply returns from the recursive call , and ends up at a failure point ( recursive call point ) .
*
* " go to parent " below actually means , return from recursive call .
*
2007-06-09 20:46:40 +00:00
* fail_action : we need to return to closest failure point ( recursive call point ) ,
* and switch current node to node pointed by fail_action
*
2007-06-09 16:09:45 +00:00
* Node types :
* OP_ROOT : contains information that applies to the entire trie .
* it can only appear as root node , and not as child node .
* On child fail : match has failed
2007-06-09 20:46:40 +00:00
* This is NOT a recursive call point
2007-06-09 16:09:45 +00:00
* OP_CHAR_BINARY_SEARCH : chooses a sub - trie , based on current character ;
* using binary - search
* On fail : go to node indicated by fail_action , or if
* fail_action is NULL , to parent
* On child fail : execute fail of current node
* OP_ALTERNATIVES : try matching each sub - trie , if all fails execute fail
* action of current node . This is a recursive call point
* OP_CHAR_REPEAT : repeat specified character a number of times in range :
* [ min_range , max_range ] ;
* min_range : 0 for * operator
* 1 for + operator
* max_range : remaining length of current string for * , + operator
* OR : min_range , max_range as specified by the { min , max } operator
* On fail : fail_action , or parent if NULL
* On child fail : reduce match repeat count , try again on child , if
* repeat count < min_range , execute fail of current node
2007-06-09 20:46:40 +00:00
* Also has a bitmap on what characters are accepted beyond it ,
* as an optimizations for the case , when a maximum match isn ' t possible
2007-06-09 16:09:45 +00:00
* Not recomended to use this when min_range = max_range = 1
* This is a recursive call point
* OP_DOT_REPEAT : like OP_CHAR_REPEAT but accept any character
* Not recomended to use this when min_range = max_range = 1
* This is a recursive call point
* OP_GROUP_START : start of a group " ( " , also specifies group flags :
* repeat : is_repeat , min_range , max_range
* This is a recursive call point if is_repeat
* OP_GROUP_END : end of group " ) "
* OP_STRCMP : compare with specified string ,
* it has an array of fail actions , one for each character
* default fail action : go to parent
* This was introduced from memory - and speed - efficiency
* considerations .
* OP_CHAR_CLASS_REPEAT : match character with character class
* min_range , max_range
* For a single character class min_range = max_range = 1
* OP_MATCH_OK : match has succeeded
*
* TODO : maybe we ' ll need a more efficient way to choose between character classes .
* OP_DOT_REPEAT / OP_CHAR_REPEAT needs a more efficient specification of its failure function , instead of using
* backtracking approach .
*
* The failure function / action is just for optimization , the match algorithms works even without it .
* TODO : In this first draft fail action will always be NULL , in a later version I ' ll implement fail actions too .
*
*
*/
2007-06-09 20:46:40 +00:00
# include <string.h>
2007-06-09 16:09:45 +00:00
# include "cltypes.h"
# include "others.h"
/* offsetof is not ANSI C */
# ifndef offsetof
# define offsetof(type,memb) ((size_t)&((type*)0)->memb)
# endif
# define container_of(ptr, type, member) ( (type *) ((char *)ptr - offsetof(type, member)) )
# define container_of_const(ptr, type, member) ( (type *) ((const char *)ptr - offsetof(type, member)) )
enum trie_node_type {
OP_ROOT ,
OP_CHAR_BINARY_SEARCH ,
OP_ALTERNATIVES ,
OP_CHAR_REPEAT ,
OP_DOT_REPEAT ,
OP_CHAR_CLASS_REPEAT ,
OP_STRCMP ,
OP_GROUP_START ,
OP_GROUP_END ,
OP_MATCH_OK
} ;
/* the comon definition of a trie node */
struct trie_node
{
enum trie_node_type type ;
} ;
struct trie_node_match {
struct trie_node node ;
/* additional match info */
} ;
struct trie_node_root
{
struct trie_node node ;
struct trie_node * child ;
} ;
struct trie_node_binary_search
{
struct trie_node node ;
uint8_t children_count ; /* number of children to search among -1! 255 = 256 children*/
struct trie_node * fail_action ;
2007-06-09 20:46:40 +00:00
unsigned char * char_choices ; /* children_count elements */
struct trie_node * * children ; /*children_count elements */
2007-06-09 16:09:45 +00:00
} ;
struct trie_node_alternatives
{
struct trie_node node ;
uint32_t alternatives_count ;
/* need to support node with lots of alternatives,
* for a worst - case scenario where each line ends up as a sub - trie of OP_ALTERNATIVES */
struct trie_node * fail_action ;
struct trie_node * * children ;
} ;
struct trie_node_char_repeat
{
struct trie_node node ;
unsigned char character ;
uint8_t range_min , range_max ; /* according to POSIX we need not support more than 255 repetitions*/
2007-06-09 20:46:40 +00:00
struct char_bitmap * bitmap_accept_after ; /* bitmap of characters accepted after this,
to optimize repeat < max_range case ; if its NULL
there is no optimization */
2007-06-09 16:09:45 +00:00
struct trie_node * child ;
struct trie_node * fail_action ;
} ;
struct trie_node_dot_repeat
{
struct trie_node node ;
uint8_t range_min , range_max ; /* according to POSIX we need not support more than 255 repetitions*/
2007-06-09 20:46:40 +00:00
struct char_bitmap * bitmap_accept_after ; /* bitmap of characters accepted after this,
to optimize repeat < max_range case ; if its NULL
there is no optimization */
2007-06-09 16:09:45 +00:00
struct trie_node * child ;
struct trie_node * fail_action ;
} ;
struct trie_node_group_start
{
struct trie_node node ;
uint8_t range_min , range_max ; /* if range_min==range_max==1, then this is NOT a repeat, thus not a recursive call point*/
struct trie_node * child ;
struct trie_node * fail_action ;
} ;
struct trie_node_group_end
{
struct trie_node node ;
struct trie_node * child ;
} ;
struct trie_node_strcmp
{
struct trie_node node ;
uint8_t string_length ; /* for longer strings a sequence of node_strcmp should be used */
unsigned char * string ;
struct trie_node * child ;
2007-06-09 20:46:40 +00:00
struct trie_node * * fail_actions ; /* this has string_length elements, or NULL if no fail_actions are computed */
2007-06-09 16:09:45 +00:00
} ;
struct trie_node_char_class_repeat
{
struct trie_node node ;
struct char_bitmap * bitmap ;
2007-06-09 20:46:40 +00:00
struct char_bitmap * bitmap_accept_after ;
2007-06-09 16:09:45 +00:00
uint8_t range_min , range_max ;
struct trie_node * child ;
struct trie_node * fail_action ;
} ;
2007-06-09 20:46:40 +00:00
static inline int bitmap_accepts ( const struct char_bitmap * bitmap , const char c )
{
/* TODO: check if c is accepted by bitmap */
return 0 ;
}
# define MATCH_FAILED 0
# define MATCH_OK 1
2007-06-09 16:09:45 +00:00
2007-06-09 20:46:40 +00:00
# define FAIL_ACTION( fail_node ) (*fail_action = (fail_node), MATCH_FAILED)
# ifndef MIN
# define MIN(a,b) ((a)<(b) ? (a) : (b))
# endif
static int match_node ( const struct trie_node * node , const unsigned char * text , const unsigned char * text_end , const struct trie_node * * fail_action ) ;
static int match_repeat ( const unsigned char * text , const unsigned char * text_end , const size_t range_min , const size_t repeat_start ,
const struct char_bitmap * bitmap_accept_after , const struct trie_node * child , const struct trie_node * * fail_action ,
const struct trie_node * this_fail_action )
2007-06-09 16:09:45 +00:00
{
2007-06-09 20:46:40 +00:00
size_t i ;
for ( i = repeat_start ; i > range_min ; i - - ) {
if ( ! bitmap_accept_after | | bitmap_accepts ( bitmap_accept_after , text [ i - 1 ] ) ) {
int rc = match_node ( child , & text [ i ] , text_end , fail_action ) ;
/* ignore fail_action for now, we have the bitmap_accepts_after optimization */
if ( rc ) {
return MATCH_OK ;
}
}
}
if ( ! range_min ) {
/* this match is optional, try child only */
int rc = match_node ( child , text , text_end , fail_action ) ;
if ( rc ) {
return MATCH_OK ;
}
}
return FAIL_ACTION ( this_fail_action ) ;
}
/* text_end points to \0 in text */
static int match_node ( const struct trie_node * node , const unsigned char * text , const unsigned char * text_end , const struct trie_node * * fail_action )
{
while ( node & & text < text_end ) {
2007-06-09 16:09:45 +00:00
switch ( node - > type ) {
case OP_ROOT :
{
const struct trie_node_root * root_node = container_of_const ( node , const struct trie_node_root , node ) ;
node = root_node - > child ;
break ;
}
case OP_CHAR_BINARY_SEARCH :
2007-06-09 20:46:40 +00:00
{
2007-06-09 16:09:45 +00:00
const struct trie_node_binary_search * bin_node = container_of_const ( node , const struct trie_node_binary_search , node ) ;
2007-06-09 20:46:40 +00:00
const unsigned char csearch = * text ;
size_t mid , left = 0 , right = bin_node - > children_count - 1 ;
while ( left < = right ) {
mid = left + ( right - left ) / 2 ;
if ( bin_node - > char_choices [ mid ] = = csearch )
break ;
else if ( bin_node - > char_choices [ mid ] < csearch )
left = mid + 1 ;
else
right = mid - 1 ;
}
if ( left < = right ) {
/* match successful */
node = bin_node - > children [ mid ] ;
+ + text ;
}
else {
return FAIL_ACTION ( bin_node - > fail_action ) ;
}
2007-06-09 16:09:45 +00:00
break ;
}
case OP_ALTERNATIVES :
{
const struct trie_node_alternatives * alt_node = container_of_const ( node , const struct trie_node_alternatives , node ) ;
2007-06-09 20:46:40 +00:00
size_t i ;
* fail_action = NULL ;
for ( i = 0 ; i < alt_node - > alternatives_count ; i + + ) {
int rc = match_node ( alt_node - > children [ i ] , text , text_end , fail_action ) ;
if ( rc ) {
return MATCH_OK ;
}
/* supporting fail_actions is tricky,
* if we just go to the node specified , what happens if the match fails , and no
* further fail_action is specified ? We should know where to continue the search .
* For now fail_action isn ' t supported for OP_ALTERNATIVES */
}
2007-06-09 16:09:45 +00:00
break ;
}
case OP_CHAR_REPEAT :
{
const struct trie_node_char_repeat * char_rep_node = container_of_const ( node , const struct trie_node_char_repeat , node ) ;
2007-06-09 20:46:40 +00:00
const size_t max_len = MIN ( text_end - text , char_rep_node - > range_max - 1 ) ;
/* todo: what about the 8 bit limitation of range_max, and what about inf (+,*)? */
const char caccept = char_rep_node - > character ;
size_t rep ;
if ( max_len < char_rep_node - > range_min )
return FAIL_ACTION ( char_rep_node - > fail_action ) ;
for ( rep = 0 ; rep < max_len ; rep + + ) {
if ( text [ rep ] ! = caccept ) {
break ;
}
}
return match_repeat ( text , text_end , char_rep_node - > range_min , rep ,
char_rep_node - > bitmap_accept_after , char_rep_node - > child , fail_action ,
char_rep_node - > fail_action ) ;
2007-06-09 16:09:45 +00:00
}
case OP_DOT_REPEAT :
{
const struct trie_node_dot_repeat * dot_rep_node = container_of_const ( node , const struct trie_node_dot_repeat , node ) ;
2007-06-09 20:46:40 +00:00
const size_t max_len = MIN ( text_end - text , dot_rep_node - > range_max - 1 ) ;
/* todo: what about the 8 bit limitation of range_max, and what about inf (+,*)? */
if ( max_len < dot_rep_node - > range_min )
return FAIL_ACTION ( dot_rep_node - > fail_action ) ;
return match_repeat ( text , text_end , dot_rep_node - > range_min , max_len ,
dot_rep_node - > bitmap_accept_after , dot_rep_node - > child , fail_action ,
dot_rep_node - > fail_action ) ;
2007-06-09 16:09:45 +00:00
}
case OP_CHAR_CLASS_REPEAT :
{
const struct trie_node_char_class_repeat * class_rep_node = container_of_const ( node , const struct trie_node_char_class_repeat , node ) ;
2007-06-09 20:46:40 +00:00
const size_t max_len = MIN ( text_end - text , class_rep_node - > range_max - 1 ) ;
/* todo: what about the 8 bit limitation of range_max, and what about inf (+,*)? */
size_t rep ;
if ( max_len < class_rep_node - > range_min )
return FAIL_ACTION ( class_rep_node - > fail_action ) ;
for ( rep = 0 ; rep < max_len ; rep + + ) {
if ( ! bitmap_accepts ( class_rep_node - > bitmap , text [ rep ] ) ) {
break ;
}
}
return match_repeat ( text , text_end , class_rep_node - > range_min , rep ,
class_rep_node - > bitmap_accept_after , class_rep_node - > child , fail_action ,
class_rep_node - > fail_action ) ;
2007-06-09 16:09:45 +00:00
break ;
}
case OP_STRCMP :
{
const struct trie_node_strcmp * strcmp_node = container_of_const ( node , const struct trie_node_strcmp , node ) ;
2007-06-09 20:46:40 +00:00
size_t i ;
if ( strcmp_node - > fail_actions ) {
const size_t max_len = MIN ( strcmp_node - > string_length , text_end - text ) ;
/* we don't use strncmp, because we need the exact match-fail point */
for ( i = 0 ; i < max_len ; i + + ) {
if ( text [ i ] ! = strcmp_node - > string [ i ] ) {
return FAIL_ACTION ( strcmp_node - > fail_actions [ i ] ) ;
}
}
if ( max_len < strcmp_node - > string_length ) {
/* failed, because text was shorter */
return FAIL_ACTION ( strcmp_node - > fail_actions [ max_len ] ) ;
}
}
else {
/* no fail_actions computed, some shortcuts possible on compare */
if ( ( text_end - text < strcmp_node - > string_length ) | |
strncmp ( ( const char * ) text , ( const char * ) strcmp_node - > string , strcmp_node - > string_length ) ) {
return FAIL_ACTION ( NULL ) ;
}
}
/* match successful */
node = strcmp_node - > child ;
text + = strcmp_node - > string_length ;
2007-06-09 16:09:45 +00:00
break ;
}
case OP_GROUP_START :
{
const struct trie_node_group_start * group_start_node = container_of_const ( node , const struct trie_node_group_start , node ) ;
2007-06-09 20:46:40 +00:00
/* TODO: implement */
2007-06-09 16:09:45 +00:00
break ;
}
case OP_GROUP_END :
2007-06-09 20:46:40 +00:00
{
2007-06-09 16:09:45 +00:00
const struct trie_node_group_end * group_end_node = container_of_const ( node , const struct trie_node_group_end , node ) ;
2007-06-09 20:46:40 +00:00
/* TODO: implement */
2007-06-09 16:09:45 +00:00
break ;
}
2007-06-09 20:46:40 +00:00
case OP_MATCH_OK :
2007-06-09 16:09:45 +00:00
{
2007-06-09 20:46:40 +00:00
return MATCH_OK ;
2007-06-09 16:09:45 +00:00
}
}
}
2007-06-09 20:46:40 +00:00
/* if fail_action was NULL, or text ended*/
return MATCH_FAILED ;
2007-06-09 16:09:45 +00:00
}
# endif