2006-09-12 19:38:39 +00:00
/*
* Match a string against a list of patterns / regexes .
*
2008-04-02 15:24:51 +00:00
* Copyright ( C ) 2007 - 2008 Sourcefire , Inc .
*
* Authors : Török Edvin
2006-09-12 19:38:39 +00:00
*
* This program is free software ; you can redistribute it and / or modify
2008-04-02 15:24:51 +00:00
* it under the terms of the GNU General Public License version 2 as
2007-06-30 11:50:56 +00:00
* published by the Free Software Foundation .
2006-09-12 19:38:39 +00:00
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
*
* You should have received a copy of the GNU General Public License
* along with this program ; if not , write to the Free Software
* Foundation , Inc . , 51 Franklin Street , Fifth Floor , Boston ,
* MA 02110 - 1301 , USA .
*/
# if HAVE_CONFIG_H
# include "clamav-config.h"
# endif
# ifndef CL_DEBUG
# define NDEBUG
# endif
# ifdef CL_THREAD_SAFE
# ifndef _REENTRANT
# define _REENTRANT
# endif
# endif
# include <stdio.h>
# include <stdlib.h>
# include <string.h>
# include <ctype.h>
2007-12-18 19:23:56 +00:00
# include <zlib.h>
2006-09-12 19:38:39 +00:00
# include <limits.h>
# include <sys/types.h>
2007-09-17 18:54:56 +00:00
# include "regex/regex.h"
2006-09-12 19:38:39 +00:00
# include "clamav.h"
# include "others.h"
# include "regex_list.h"
# include "matcher-ac.h"
2008-02-06 20:48:34 +00:00
# include "matcher.h"
2007-03-11 11:14:35 +00:00
# include "str.h"
2007-12-18 19:23:56 +00:00
# include "readdb.h"
2006-09-12 19:38:39 +00:00
/*Tree*/
/* Kinds of node in the pattern tree.  Declaration order (and therefore the
 * numeric values) is unchanged. */
enum token_op_t {
    OP_CHAR,        /* one literal character, stored in tree_node.c */
    OP_STDCLASS,    /* standard character class */
    OP_CUSTOMCLASS, /* custom character class backed by a bitmap */
    OP_DOT,         /* single '.' */
    OP_LEAF,        /* leaf node */
    OP_ROOT,
    OP_PARCLOSE
};

/* Pointer to the bytes of a character bitmap. */
typedef unsigned char *char_bitmap_p;
2006-09-12 19:38:39 +00:00
/*
*
* OP_CHAR : 1 character , c = character
* complex stuff :
* OP_STDCLASS : standard character class , c = char class , class : 1 < < ( index into std_class of class name )
* OP_CUSTOMCLASS : custom character class , first pointer in ptr array is a pointer to the bitmap table for this class
* OP_DOT : single . matching any character except \ n
* OP_LEAF : this is a leaf node , reinterpret structure
*/
struct tree_node {
2006-10-10 23:51:49 +00:00
struct tree_node * next ; /* next regex/complex sibling, or parent, if no more siblings , can't be NULL except for root node*/
2006-09-12 19:38:39 +00:00
union {
struct tree_node * * children ; /* alternatives nr. of children, followed by (a null pointer terminated) regex leaf node pointers) */
char_bitmap_p * bitmap ;
struct leaf_info * leaf ;
} u ;
2007-12-28 12:30:23 +00:00
enum token_op_t op ;
unsigned char c ;
char alternatives ; /* number of (non-regex) children of node, i.e. sizeof(children)*/
char listend ; /* no more siblings, next pointer is pointer to parent*/
2006-09-12 19:38:39 +00:00
} ;
/* Payload attached to a leaf of the pattern tree. */
struct leaf_info {
    /* What it means that the leaf was reached. */
    char *info;

    /* Compiled regex; NULL when the leaf is a plain (non-regex) leaf. */
    regex_t *preg;
};
/* Character classes */
2006-10-10 23:51:49 +00:00
/*
 * Names of the standard character classes recognised inside bracket
 * expressions.  The index of a name in this array is the bit number that
 * getNextToken() tests in char_class[] (`char_class[i] & (1 << found)`),
 * so the order here and the generated tables below must stay in sync.
 */
static const char * std_class [ ] = {
" [:alnum:] " ,
" [:digit:] " ,
" [:punct:] " ,
" [:alpha:] " ,
" [:graph:] " ,
" [:space:] " ,
" [:blank:] " ,
" [:lower:] " ,
" [:upper:] " ,
" [:cntrl:] " ,
" [:print:] " ,
" [:xdigit:] "
/* don't change the order of these strings, unless you change them in generate_tables.c too, and regenerate the tables*/
2006-09-12 19:38:39 +00:00
} ;
2006-10-07 11:00:46 +00:00
2006-09-12 19:38:39 +00:00
/* Number of entries in std_class[].  The expansion is parenthesised so the
 * macro behaves as a single operand inside larger expressions (without the
 * parentheses `x / STD_CLASS_CNT` would parse as `(x / sizeof a) / sizeof a[0]`). */
#define STD_CLASS_CNT (sizeof(std_class) / sizeof(std_class[0]))
2006-10-07 11:00:46 +00:00
2006-10-10 23:51:49 +00:00
/* generated by contrib/phishing/generate_tables.c */
/*
 * One 32-byte (256-bit) membership bitmap per entry of std_class[], in the
 * same order.  Byte value c is represented by bit (c & 7) of byte (c >> 3);
 * e.g. the second row ([:digit:]) sets exactly bits 48..57, '0'..'9'.
 * Only ASCII (0x00-0x7f) bits are ever set.  No use of this table is visible
 * in this part of the file.
 */
static const unsigned char char_class_bitmap [ STD_CLASS_CNT ] [ 32 ] = {
{ 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0xff , 0x03 ,
0xfe , 0xff , 0xff , 0x07 , 0xfe , 0xff , 0xff , 0x07 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0xff , 0x03 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x00 , 0x00 , 0x00 , 0xfe , 0xff , 0x00 , 0xfc ,
0x01 , 0x00 , 0x00 , 0xf8 , 0x01 , 0x00 , 0x00 , 0x78 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0xfe , 0xff , 0xff , 0x07 , 0xfe , 0xff , 0xff , 0x07 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x00 , 0x00 , 0x00 , 0xfe , 0xff , 0xff , 0xff ,
0xff , 0xff , 0xff , 0xff , 0xff , 0xff , 0xff , 0x7f ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x3e , 0x00 , 0x00 , 0x01 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x02 , 0x00 , 0x00 , 0x01 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0xfe , 0xff , 0xff , 0x07 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0xfe , 0xff , 0xff , 0x07 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0xff , 0xff , 0xff , 0xff , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x80 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x00 , 0x00 , 0x00 , 0xff , 0xff , 0xff , 0xff ,
0xff , 0xff , 0xff , 0xff , 0xff , 0xff , 0xff , 0x7f ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 } ,
{ 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0xff , 0x03 ,
0x7e , 0x00 , 0x00 , 0x00 , 0x7e , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 }
} ;
2006-10-07 11:00:46 +00:00
2006-10-10 23:51:49 +00:00
/*
 * Per-byte class membership mask, indexed by byte value.  Bit i of
 * char_class[c] is the membership flag of byte c in std_class[i];
 * getNextToken() tests it as `char_class[i] & (1 << found)`.
 * Example: '0' is 0xc13 = alnum | digit | graph | print | xdigit.
 * All entries for bytes 0x80-0xff are zero.
 */
static const unsigned short int char_class [ 256 ] = {
0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x260 , 0x220 , 0x220 , 0x220 , 0x220 , 0x200 , 0x200 ,
0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 ,
0x460 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 ,
0xc13 , 0xc13 , 0xc13 , 0xc13 , 0xc13 , 0xc13 , 0xc13 , 0xc13 , 0xc13 , 0xc13 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 ,
0x414 , 0xd19 , 0xd19 , 0xd19 , 0xd19 , 0xd19 , 0xd19 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 ,
0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x519 , 0x414 , 0x414 , 0x414 , 0x414 , 0x414 ,
0x414 , 0xc99 , 0xc99 , 0xc99 , 0xc99 , 0xc99 , 0xc99 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 ,
0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x499 , 0x414 , 0x414 , 0x414 , 0x414 , 0x200 ,
0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 ,
0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 ,
0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 ,
0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 ,
0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 ,
0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 ,
0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 ,
0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000 , 0x000
} ;
2006-09-12 19:38:39 +00:00
2006-10-10 23:51:49 +00:00
/* Entry count of std_class[] as a typed constant; getNextToken() uses it as
 * the bound when searching for a class name. */
static const size_t std_class_cnt = sizeof std_class / sizeof *std_class;
2006-10-07 11:00:46 +00:00
2006-09-12 19:38:39 +00:00
/* Prototypes */
2007-03-18 23:27:15 +00:00
/* Forward declarations of helpers that are used before their definitions. */
static int add_pattern(struct regex_matcher *matcher,
                       const unsigned char *pat,
                       const char *info,
                       int hostOnly);
static int match_node(struct tree_node *node,
                      const unsigned char *c,
                      size_t len,
                      const char **info);
static void destroy_tree(struct regex_matcher *matcher);
static struct tree_node *tree_root_alloc(void);
static int build_regex_list(struct regex_matcher *matcher);
static void stack_destroy(struct node_stack *stack);

/* dump_tree() is only declared when NDEBUG is not defined (NDEBUG is defined
 * at the top of this file unless CL_DEBUG is set). */
#ifndef NDEBUG
void dump_tree(struct tree_node *root);
#endif
2006-09-12 19:38:39 +00:00
/* Result codes of the tree matcher; regex_list_match() compares the return
 * value of match_node() against MATCH_SUCCESS. */
#define MATCH_SUCCESS 0
#define MATCH_FAILED  (-1)
/*
* Call this function when an unrecoverable error has occured , ( instead of exit ) .
*/
static void fatal_error ( struct regex_matcher * matcher )
{
regex_list_done ( matcher ) ;
matcher - > list_inited = - 1 ; /* the phishing module will know we tried to load a whitelist, and failed, so it will disable itself too*/
}
2007-09-27 21:27:37 +00:00
static inline size_t get_char_at_pos_with_skip ( const struct pre_fixup_info * info , const char * buffer , size_t pos )
{
2007-09-30 21:00:05 +00:00
const char * str ;
2007-09-28 18:45:58 +00:00
size_t realpos = 0 ;
2007-09-27 21:27:37 +00:00
if ( ! info ) {
2007-09-30 21:00:05 +00:00
return ( pos < = strlen ( buffer ) ) ? buffer [ pos > 0 ? pos - 1 : 0 ] : ' \0 ' ;
2007-09-27 21:27:37 +00:00
}
2007-09-30 21:00:05 +00:00
str = info - > pre_displayLink . data ;
2007-10-06 08:47:46 +00:00
cli_dbgmsg ( " calc_pos_with_skip: skip:%lu, %lu - %lu \" %s \" , \" %s \" \n " , pos , info - > host_start , info - > host_end , str , buffer ) ;
2007-09-28 18:45:58 +00:00
pos + = info - > host_start ;
2007-09-30 21:00:05 +00:00
while ( str [ realpos ] & & ! isalnum ( str [ realpos ] ) ) realpos + + ;
for ( ; str [ realpos ] & & ( pos > 0 ) ; pos - - ) {
while ( str [ realpos ] = = ' ' ) realpos + + ;
2007-09-27 21:27:37 +00:00
realpos + + ;
}
2007-09-30 21:00:05 +00:00
while ( str [ realpos ] = = ' ' ) realpos + + ;
2008-01-30 18:44:07 +00:00
cli_dbgmsg ( " calc_pos_with_skip:%s \n " , str + realpos ) ;
2007-09-30 21:00:05 +00:00
return ( pos > 0 & & ! str [ realpos ] ) ? ' \0 ' : str [ realpos > 0 ? realpos - 1 : 0 ] ;
2007-09-27 21:27:37 +00:00
}
2006-09-12 19:38:39 +00:00
/*
* @ matcher - matcher structure to use
* @ real_url - href target
* @ display_url - < a > tag contents
* @ hostOnly - if you want to match only the host part
2006-10-10 23:51:49 +00:00
* @ is_whitelist - is this a lookup in whitelist ?
2006-09-12 19:38:39 +00:00
*
* @ return - CL_SUCCESS - url doesn ' t match
* - CL_VIRUS - url matches list
*
* Do not send NULL pointers to this function ! !
*
*/
2007-10-03 12:53:09 +00:00
int regex_list_match ( struct regex_matcher * matcher , char * real_url , const char * display_url , const struct pre_fixup_info * pre_fixup , int hostOnly , const char * * info , int is_whitelist )
2006-09-12 19:38:39 +00:00
{
2008-01-30 18:44:07 +00:00
char * orig_real_url = real_url ;
2006-10-10 23:51:49 +00:00
massert ( matcher ) ;
massert ( real_url ) ;
massert ( display_url ) ;
massert ( info ) ;
2006-09-12 19:38:39 +00:00
if ( ! matcher - > list_inited )
return 0 ;
2006-10-10 23:51:49 +00:00
massert ( matcher - > list_built ) ;
2008-01-30 18:44:07 +00:00
/* skip initial '.' inserted by get_host */
if ( real_url [ 0 ] = = ' . ' ) real_url + + ;
if ( display_url [ 0 ] = = ' . ' ) display_url + + ;
2006-09-12 19:38:39 +00:00
{
size_t real_len = strlen ( real_url ) ;
size_t display_len = strlen ( display_url ) ;
2006-12-19 20:47:45 +00:00
size_t buffer_len = ( hostOnly & & ! is_whitelist ) ? real_len : real_len + display_len + 1 + ( is_whitelist ? 1 : 0 ) ;
2006-09-12 19:38:39 +00:00
char * buffer = cli_malloc ( buffer_len + 1 ) ;
2006-10-15 19:16:33 +00:00
size_t i ;
2006-11-15 15:26:54 +00:00
int rc = 0 ;
struct cli_ac_data mdata ;
2006-09-12 19:38:39 +00:00
if ( ! buffer )
return CL_EMEM ;
strncpy ( buffer , real_url , real_len ) ;
2006-10-28 15:34:40 +00:00
buffer [ real_len ] = ( ! is_whitelist & & hostOnly ) ? ' \0 ' : ' : ' ;
2006-10-10 23:51:49 +00:00
if ( ! hostOnly | | is_whitelist ) {
2006-09-27 19:16:11 +00:00
strncpy ( buffer + real_len + 1 , display_url , display_len ) ;
2008-01-30 18:44:07 +00:00
if ( is_whitelist )
2006-12-19 20:47:45 +00:00
buffer [ buffer_len - 1 ] = ' / ' ;
2006-09-27 19:16:11 +00:00
buffer [ buffer_len ] = 0 ;
2006-09-16 15:49:27 +00:00
}
2006-09-27 19:16:11 +00:00
cli_dbgmsg ( " Looking up in regex_list: %s \n " , buffer ) ;
2006-09-12 19:38:39 +00:00
2006-11-15 15:26:54 +00:00
if ( hostOnly ) {
if ( ( rc = cli_ac_initdata ( & mdata , 0 , AC_DEFAULT_TRACKLEN ) ) )
return rc ;
rc = 0 ;
2006-10-15 19:16:33 +00:00
for ( i = 0 ; i < matcher - > root_hosts_cnt ; i + + ) {
2007-09-30 21:00:05 +00:00
/* doesn't need to match terminating \0*/
2008-02-22 00:26:25 +00:00
rc = cli_ac_scanbuff ( ( unsigned char * ) buffer , buffer_len , info , & matcher - > root_hosts [ i ] , & mdata , 0 , 0 , - 1 , NULL , AC_SCAN_VIR , NULL ) ;
2007-06-16 16:36:22 +00:00
cli_ac_freedata ( & mdata ) ;
2007-06-26 07:40:02 +00:00
if ( rc ) {
2007-09-27 21:27:37 +00:00
char c ;
2008-01-30 18:44:07 +00:00
const char * matched = strchr ( * info , ' : ' ) ;
2007-06-26 07:40:02 +00:00
const size_t match_len = matched ? strlen ( matched + 1 ) : 0 ;
2007-09-30 21:00:05 +00:00
if ( ( ( c = get_char_at_pos_with_skip ( pre_fixup , buffer , buffer_len + 1 ) ) = = ' ' | | c = = ' \0 ' | | c = = ' / ' | | c = = ' ? ' ) & &
( match_len = = buffer_len | | /* full match */
2007-06-26 07:40:02 +00:00
( match_len < buffer_len & &
2008-01-30 18:44:07 +00:00
( ( c = get_char_at_pos_with_skip ( pre_fixup , buffer , buffer_len - match_len ) ) = = ' . ' | | ( c = = ' ' ) ) )
2007-09-30 21:00:05 +00:00
/* subdomain matched*/ ) ) {
2007-06-26 07:40:02 +00:00
2008-01-30 18:44:07 +00:00
cli_dbgmsg ( " Got a match: %s with %s \n " , buffer , * info ) ;
cli_dbgmsg ( " Before inserting .: %s \n " , orig_real_url ) ;
2007-10-03 12:53:09 +00:00
if ( real_len > = match_len + 1 ) {
2008-01-30 18:44:07 +00:00
const size_t pos = real_len - match_len - 1 ;
if ( real_url [ pos ] ! = ' . ' ) {
/* we need to shift left, and insert a '.'
* we have an extra ' . ' at the beginning inserted by get_host to have room ,
* orig_real_url has to be used here ,
* because we want to overwrite that extra ' . ' */
size_t orig_real_len = strlen ( orig_real_url ) ;
2008-02-06 12:24:58 +00:00
cli_dbgmsg ( " No dot here:%s \n " , real_url + pos ) ;
2008-01-30 18:44:07 +00:00
real_url = orig_real_url ;
memmove ( real_url , real_url + 1 , orig_real_len - match_len - 1 ) ;
real_url [ orig_real_len - match_len - 1 ] = ' . ' ;
cli_dbgmsg ( " After inserting .: %s \n " , real_url ) ;
}
2007-10-03 12:53:09 +00:00
}
2007-06-26 07:40:02 +00:00
break ;
}
2008-01-30 18:44:07 +00:00
cli_dbgmsg ( " Ignoring false match: %s with %s, mismatched character: %c \n " , buffer , * info , c ) ;
2007-06-26 07:40:02 +00:00
rc = 0 ;
}
2006-10-15 19:16:33 +00:00
}
2006-11-15 15:26:54 +00:00
} else
2006-10-15 19:16:33 +00:00
rc = 0 ;
2008-01-30 18:44:07 +00:00
if ( ! rc )
2007-03-18 23:27:15 +00:00
rc = match_node ( hostOnly ? matcher - > root_regex_hostonly : matcher - > root_regex , ( unsigned char * ) buffer , buffer_len , info ) = = MATCH_SUCCESS ? CL_VIRUS : CL_SUCCESS ;
2006-09-12 19:38:39 +00:00
free ( buffer ) ;
if ( ! rc )
2007-04-28 20:15:22 +00:00
cli_dbgmsg ( " Lookup result: not in regex list \n " ) ;
else
cli_dbgmsg ( " Lookup result: in regex list \n " ) ;
2006-09-12 19:38:39 +00:00
return rc ;
}
}
/* node stack */
/* Node stack sizing, in elements: initial capacity set by stack_init() and
 * the amount stack_push() adds each time the stack is full. */
#define NODE_STACK_INITIAL 1024
#define NODE_STACK_GROW    4096
/* Initialize @stack */
static int stack_init ( struct node_stack * stack )
{
2006-10-10 23:51:49 +00:00
massert ( stack ) ;
2006-09-12 19:38:39 +00:00
stack - > cnt = 0 ;
stack - > capacity = NODE_STACK_INITIAL ;
stack - > data = cli_malloc ( stack - > capacity * sizeof ( * stack - > data ) ) ;
if ( ! stack - > data )
return CL_EMEM ;
else
return CL_SUCCESS ;
}
/* Reset @stack pointer, but don't realloc */
static void stack_reset ( struct node_stack * stack )
{
2006-10-10 23:51:49 +00:00
massert ( stack ) ;
2006-09-12 19:38:39 +00:00
stack - > cnt = 0 ;
}
/* Push @node on @stack, growing it if necessarry */
2007-03-11 11:14:35 +00:00
static int stack_push ( struct node_stack * stack , struct tree_node * node )
2006-09-12 19:38:39 +00:00
{
2006-10-10 23:51:49 +00:00
massert ( stack ) ;
massert ( stack - > data ) ;
2006-09-12 19:38:39 +00:00
if ( stack - > cnt = = stack - > capacity ) {
stack - > capacity + = NODE_STACK_GROW ;
2007-05-25 23:10:58 +00:00
stack - > data = cli_realloc2 ( stack - > data , stack - > capacity * sizeof ( * stack - > data ) ) ;
2006-09-12 19:38:39 +00:00
if ( ! stack - > data )
return CL_EMEM ;
}
stack - > data [ stack - > cnt + + ] = node ;
return CL_SUCCESS ;
}
/* Pops node from @stack, doesn't realloc */
2007-03-11 11:14:35 +00:00
static struct tree_node * stack_pop ( struct node_stack * stack )
2006-09-12 19:38:39 +00:00
{
2006-10-10 23:51:49 +00:00
massert ( stack ) ;
massert ( stack - > data ) ;
massert ( stack - > cnt ) ; /*don't pop from empty stack */
2006-09-12 19:38:39 +00:00
return stack - > cnt ? stack - > data [ - - stack - > cnt ] : NULL ;
}
/* Initialization & loading */
/* Initializes @matcher, allocating necesarry substructures */
int init_regex_list ( struct regex_matcher * matcher )
{
2006-10-15 19:16:33 +00:00
int rc ;
2006-10-10 23:51:49 +00:00
massert ( matcher ) ;
2006-09-12 19:38:39 +00:00
matcher - > list_inited = 0 ;
2006-10-15 19:16:33 +00:00
matcher - > root_hosts_cnt = 0 ;
matcher - > root_hosts = NULL ;
matcher - > root_hosts_cnt = 0 ;
2006-09-12 19:38:39 +00:00
matcher - > root_regex = tree_root_alloc ( ) ;
if ( ! matcher - > root_regex ) {
return CL_EMEM ;
}
2007-03-18 23:27:15 +00:00
matcher - > root_regex_hostonly = tree_root_alloc ( ) ;
if ( ! matcher - > root_regex_hostonly ) {
free ( matcher - > root_regex ) ;
return CL_EMEM ;
}
2006-10-15 19:16:33 +00:00
if ( ( rc = stack_init ( & matcher - > node_stack ) ) ) {
2007-03-18 23:27:15 +00:00
free ( matcher - > root_regex_hostonly ) ;
2006-10-15 19:16:33 +00:00
free ( matcher - > root_regex ) ;
return rc ;
}
if ( ( rc = stack_init ( & matcher - > node_stack_alt ) ) ) {
2007-03-18 23:27:15 +00:00
free ( matcher - > root_regex_hostonly ) ;
2006-10-15 19:16:33 +00:00
free ( matcher - > root_regex ) ;
stack_destroy ( & matcher - > node_stack ) ;
return rc ;
}
2006-09-12 19:38:39 +00:00
matcher - > list_inited = 1 ;
2006-10-15 19:16:33 +00:00
matcher - > list_built = 1 ; /* its empty, but pretend its built, so that load_ will realloc root_hosts */
2006-09-12 19:38:39 +00:00
matcher - > list_loaded = 0 ;
return CL_SUCCESS ;
}
/* inserts @pattern into @root, using ac-matcher
* although the name might be confusing , @ pattern is not a regex ! */
static int add_regex_list_element ( struct cli_matcher * root , const char * pattern , char * info )
{
2006-10-10 23:51:49 +00:00
int ret ;
2006-09-12 19:38:39 +00:00
struct cli_ac_patt * new = cli_calloc ( 1 , sizeof ( * new ) ) ;
2006-10-10 23:51:49 +00:00
size_t len , i ;
2006-09-12 19:38:39 +00:00
if ( ! new )
return CL_EMEM ;
2006-10-10 23:51:49 +00:00
massert ( root ) ;
massert ( pattern ) ;
2006-09-12 19:38:39 +00:00
2007-09-30 21:00:05 +00:00
len = strlen ( pattern ) ;
/* need not to match \0 too */
2008-02-20 22:04:48 +00:00
new - > rtype = 0 ;
2006-09-12 19:38:39 +00:00
new - > type = 0 ;
new - > sigid = 0 ;
new - > parts = 0 ;
new - > partno = 0 ;
new - > mindist = 0 ;
new - > maxdist = 0 ;
new - > offset = 0 ;
new - > target = 0 ;
new - > length = len ;
2008-02-06 20:48:34 +00:00
new - > ch [ 0 ] = new - > ch [ 1 ] | = CLI_MATCH_IGNORE ;
2006-09-12 19:38:39 +00:00
if ( new - > length > root - > maxpatlen )
root - > maxpatlen = new - > length ;
new - > pattern = cli_malloc ( sizeof ( new - > pattern [ 0 ] ) * len ) ;
if ( ! new - > pattern ) {
free ( new ) ;
return CL_EMEM ;
}
2006-09-16 15:49:27 +00:00
for ( i = 0 ; i < len ; i + + )
new - > pattern [ i ] = pattern [ i ] ; /*new->pattern is short int* */
2006-09-12 19:38:39 +00:00
2007-06-26 07:40:02 +00:00
2007-03-11 11:14:35 +00:00
new - > virname = cli_strdup ( info ) ;
2006-09-12 19:38:39 +00:00
if ( ( ret = cli_ac_addpatt ( root , new ) ) ) {
free ( new - > virname ) ;
free ( new - > pattern ) ;
free ( new ) ;
return ret ;
}
return CL_SUCCESS ;
}
2006-12-19 20:30:17 +00:00
static int functionality_level_check ( char * line )
2006-12-02 00:42:44 +00:00
{
char * ptmin ;
char * ptmax ;
size_t j ;
ptmin = strrchr ( line , ' : ' ) ;
if ( ! ptmin )
return CL_SUCCESS ;
ptmin + + ;
ptmax = strchr ( ptmin , ' - ' ) ;
if ( ! ptmax )
return CL_SUCCESS ; /* there is no functionality level specified, so we're ok */
else {
2006-12-19 20:30:17 +00:00
size_t min , max ;
2006-12-02 00:42:44 +00:00
ptmax + + ;
2006-12-19 20:30:17 +00:00
for ( j = 0 ; j + ptmin + 1 < ptmax ; j + + )
2006-12-02 00:42:44 +00:00
if ( ! isdigit ( ptmin [ j ] ) )
return CL_SUCCESS ; /* not numbers, not functionality level */
for ( j = 0 ; j < strlen ( ptmax ) ; j + + )
if ( ! isdigit ( ptmax [ j ] ) )
return CL_SUCCESS ; /* see above */
ptmax [ - 1 ] = ' \0 ' ;
min = atoi ( ptmin ) ;
if ( strlen ( ptmax ) = = 0 )
max = INT_MAX ;
else
max = atoi ( ptmax ) ;
if ( min > cl_retflevel ( ) ) {
2007-03-11 11:14:35 +00:00
cli_dbgmsg ( " regex list line %s not loaded (required f-level: %u) \n " , line , ( unsigned int ) min ) ;
2006-12-02 00:42:44 +00:00
return CL_EMALFDB ;
}
if ( max < cl_retflevel ( ) )
return CL_EMALFDB ;
ptmin [ - 1 ] = ' \0 ' ;
return CL_SUCCESS ;
}
}
2006-09-12 19:38:39 +00:00
/* Load patterns/regexes from file */
2008-05-18 21:32:27 +00:00
int load_regex_matcher ( struct regex_matcher * matcher , FILE * fd , unsigned int options , int is_whitelist , struct cli_dbio * dbio )
2006-09-12 19:38:39 +00:00
{
int rc , line = 0 ;
char buffer [ FILEBUFF ] ;
2006-10-10 23:51:49 +00:00
massert ( matcher ) ;
2006-09-12 19:38:39 +00:00
if ( matcher - > list_inited = = - 1 )
2006-10-10 23:51:49 +00:00
return CL_EMALFDB ; /* already failed to load */
2006-10-15 19:16:33 +00:00
/* if(matcher->list_loaded) {
2006-09-12 19:38:39 +00:00
cli_warnmsg ( " Regex list has already been loaded, ignoring further requests for load \n " ) ;
2006-10-10 23:51:49 +00:00
return CL_SUCCESS ;
2006-10-15 19:16:33 +00:00
} */
2008-05-18 21:32:27 +00:00
if ( ! fd & & ! dbio ) {
2006-09-12 19:38:39 +00:00
cli_errmsg ( " Unable to load regex list (null file) \n " ) ;
2006-10-10 23:51:49 +00:00
return CL_EIO ;
2006-09-12 19:38:39 +00:00
}
cli_dbgmsg ( " Loading regex_list \n " ) ;
if ( ! matcher - > list_inited ) {
2006-10-10 23:51:49 +00:00
rc = init_regex_list ( matcher ) ;
2006-09-12 19:38:39 +00:00
if ( ! matcher - > list_inited ) {
cli_errmsg ( " Regex list failed to initialize! \n " ) ;
fatal_error ( matcher ) ;
2006-10-10 23:51:49 +00:00
return rc ;
2006-09-12 19:38:39 +00:00
}
/*atexit(regex_list_done); TODO: destroy this in manager.c */
}
/*
* Regexlist db format ( common to . wdb ( whitelist ) and . pdb ( domainlist ) files :
* Multiple lines of form , ( empty lines are skipped ) :
* Flags RealURL DisplayedURL
* Where :
2007-03-18 23:27:15 +00:00
* Flags :
*
* . pdb files :
* R - regex , H - host - only , followed by ( optional ) 3 - digit hexnumber representing
2006-09-12 19:38:39 +00:00
* flags that should be filtered .
* [ i . e . phishcheck urls . flags that we don ' t want to be done for this particular host ]
2007-03-18 23:27:15 +00:00
*
* . wdb files :
* X - full URL regex
* Y - host - only regex
* M - host simple pattern
2006-09-12 19:38:39 +00:00
*
* If a line in the file doesn ' t conform to this format , loading fails
*
*/
2008-05-18 21:32:27 +00:00
while ( cli_dbgets ( buffer , FILEBUFF , fd , dbio ) ) {
2006-09-12 19:38:39 +00:00
char * pattern ;
char * flags ;
cli_chomp ( buffer ) ;
if ( ! * buffer )
continue ; /* skip empty lines */
2006-12-02 00:42:44 +00:00
if ( functionality_level_check ( buffer ) )
continue ;
line + + ;
2006-10-28 15:34:40 +00:00
pattern = strchr ( buffer , ' : ' ) ;
2006-09-12 19:38:39 +00:00
if ( ! pattern ) {
cli_errmsg ( " Malformed regex list line %d \n " , line ) ;
fatal_error ( matcher ) ;
return CL_EMALFDB ;
}
2007-06-26 07:40:02 +00:00
/*pattern[0]='\0';*/
2006-10-10 23:51:49 +00:00
flags = buffer + 1 ;
2006-09-12 19:38:39 +00:00
pattern + + ;
2006-12-19 20:47:45 +00:00
if ( is_whitelist ) {
const size_t pattern_len = strlen ( pattern ) ;
if ( pattern_len < FILEBUFF ) {
pattern [ pattern_len ] = ' / ' ;
pattern [ pattern_len + 1 ] = ' \0 ' ;
}
else {
cli_errmsg ( " Overlong regex line %d \n " , line ) ;
fatal_error ( matcher ) ;
return CL_EMALFDB ;
}
}
2007-03-18 23:27:15 +00:00
if ( ( buffer [ 0 ] = = ' R ' & & ! is_whitelist ) | | ( ( buffer [ 0 ] = = ' X ' | | buffer [ 0 ] = = ' Y ' ) & & is_whitelist ) ) { /*regex*/
if ( ( rc = add_pattern ( matcher , ( const unsigned char * ) pattern , flags , buffer [ 0 ] = = ' Y ' ) ) )
2006-09-12 19:38:39 +00:00
return rc = = CL_EMEM ? CL_EMEM : CL_EMALFDB ;
}
2006-10-15 19:16:33 +00:00
else if ( ( buffer [ 0 ] = = ' H ' & & ! is_whitelist ) | | ( buffer [ 0 ] = = ' M ' & & is_whitelist ) ) { /*matches displayed host*/
2007-04-28 20:15:22 +00:00
struct cli_matcher * root ;
2006-10-15 19:16:33 +00:00
if ( matcher - > list_built ) {
struct cli_matcher * old_hosts = matcher - > root_hosts ;
matcher - > root_hosts_cnt + + ;
2007-05-26 10:22:18 +00:00
matcher - > root_hosts = cli_realloc ( matcher - > root_hosts , matcher - > root_hosts_cnt * sizeof ( * matcher - > root_hosts ) ) ;
2006-10-15 19:16:33 +00:00
if ( ! matcher - > root_hosts ) {
matcher - > root_hosts = old_hosts ; /* according to manpage this must still be valid*/
return CL_EMEM ;
2007-03-11 11:14:35 +00:00
}
2007-04-28 20:15:22 +00:00
root = & matcher - > root_hosts [ matcher - > root_hosts_cnt - 1 ] ;
memset ( root , 0 , sizeof ( struct cli_matcher ) ) ;
cli_dbgmsg ( " regex_list: Initialising AC pattern matcher \n " ) ;
2007-11-08 15:17:08 +00:00
if ( ( rc = cli_ac_init ( root , cli_ac_mindepth , cli_ac_maxdepth ) ) ) {
2007-04-28 20:15:22 +00:00
/* no need to free previously allocated memory here */
cli_errmsg ( " regex_list: Can't initialise AC pattern matcher \n " ) ;
return rc ;
}
2006-10-15 19:16:33 +00:00
matcher - > list_built = 0 ;
}
2007-04-28 20:15:22 +00:00
else {
root = & matcher - > root_hosts [ matcher - > root_hosts_cnt - 1 ] ;
}
if ( ( rc = add_regex_list_element ( root , pattern , flags ) ) )
2006-09-12 19:38:39 +00:00
return rc = = CL_EMEM ? CL_EMEM : CL_EMALFDB ;
}
else {
2006-10-10 23:51:49 +00:00
return CL_EMALFDB ;
/* this is useless, we have host, and regex matches
2006-09-12 19:38:39 +00:00
if ( ( rc = add_regex_list_element ( matcher - > root_urls , pattern , flags ) ) )
2006-10-10 23:51:49 +00:00
return rc = = CL_EMEM ? CL_EMEM : CL_EMALFDB ; */
2006-09-12 19:38:39 +00:00
}
}
matcher - > list_loaded = 1 ;
2006-10-15 19:16:33 +00:00
if ( ( rc = build_regex_list ( matcher ) ) )
return rc ;
2006-09-12 19:38:39 +00:00
# ifndef NDEBUG
/* dump_tree(matcher->root_regex);*/
# endif
if ( ! matcher - > list_built ) {
cli_errmsg ( " Regex list not loaded: build failed! \n " ) ;
fatal_error ( matcher ) ;
return CL_EMALFDB ;
}
regex_list_cleanup ( matcher ) ;
return CL_SUCCESS ;
}
static struct tree_node * * tree_node_get_children ( const struct tree_node * node )
{
return node - > op = = OP_CUSTOMCLASS ? ( node - > u . children [ 1 ] ? node - > u . children + 1 : NULL ) : node - > u . children ;
}
2006-10-10 23:51:49 +00:00
2006-09-12 19:38:39 +00:00
/* Build the matcher list */
static int build_regex_list ( struct regex_matcher * matcher )
{
2006-10-15 19:16:33 +00:00
int rc ;
2006-09-12 19:38:39 +00:00
if ( ! matcher - > list_inited | | ! matcher - > list_loaded ) {
cli_errmsg ( " Regex list not loaded! \n " ) ;
return - 1 ; /*TODO: better error code */
}
cli_dbgmsg ( " Building regex list \n " ) ;
2006-10-30 17:53:03 +00:00
if ( matcher - > root_hosts )
if ( ( rc = cli_ac_buildtrie ( & matcher - > root_hosts [ matcher - > root_hosts_cnt - 1 ] ) ) )
2006-10-15 19:16:33 +00:00
return rc ;
2006-09-12 19:38:39 +00:00
matcher - > list_built = 1 ;
return CL_SUCCESS ;
}
/* Done with this matcher, free resources */
void regex_list_done ( struct regex_matcher * matcher )
{
2006-10-10 23:51:49 +00:00
massert ( matcher ) ;
2006-09-12 19:38:39 +00:00
regex_list_cleanup ( matcher ) ;
if ( matcher - > list_loaded ) {
2006-10-14 23:52:02 +00:00
if ( matcher - > root_hosts ) {
2006-10-15 19:16:33 +00:00
size_t i ;
2007-04-28 20:15:22 +00:00
for ( i = 0 ; i < matcher - > root_hosts_cnt ; i + + )
2006-10-15 19:16:33 +00:00
cli_ac_free ( & matcher - > root_hosts [ i ] ) ;
2006-10-14 23:52:02 +00:00
free ( matcher - > root_hosts ) ;
matcher - > root_hosts = NULL ;
}
2006-09-12 19:38:39 +00:00
2006-10-15 19:16:33 +00:00
matcher - > root_hosts_cnt = 0 ;
2006-09-12 19:38:39 +00:00
matcher - > list_built = 0 ;
destroy_tree ( matcher ) ;
matcher - > list_loaded = 0 ;
}
if ( matcher - > list_inited ) {
matcher - > list_inited = 0 ;
}
stack_destroy ( & matcher - > node_stack ) ;
stack_destroy ( & matcher - > node_stack_alt ) ;
}
/* Tree matcher algorithm */
struct token_t
{
union {
2006-10-10 23:51:49 +00:00
const unsigned char * start ;
char_bitmap_p bitmap ;
unsigned char c ;
2006-09-12 19:38:39 +00:00
} u ;
2006-10-10 23:51:49 +00:00
size_t len ;
char type ;
2006-09-12 19:38:39 +00:00
} ;
/* Token kinds stored in token_t.type.  Declaration order (and therefore the
 * numeric values) is unchanged. */
enum {
    TOKEN_CHAR,
    TOKEN_DOT,
    TOKEN_PAR_OPEN,
    TOKEN_PAR_CLOSE,
    TOKEN_BRACKET,
    TOKEN_ALT,
    TOKEN_REGEX,
    TOKEN_DONE
};
static const unsigned char * getNextToken ( const unsigned char * pat , struct token_t * token )
{
2006-10-10 23:51:49 +00:00
massert ( pat ) ;
massert ( token ) ;
2006-09-12 19:38:39 +00:00
switch ( * pat ) {
case ' \\ ' :
token - > type = TOKEN_CHAR ;
2006-10-10 23:51:49 +00:00
token - > u . c = * ( + + pat ) ;
if ( islower ( token - > u . c ) ) {
2006-09-12 19:38:39 +00:00
/* handle \n, \t, etc. */
2006-09-25 18:29:02 +00:00
char fmt [ 3 ] = { ' \\ ' , ' \0 ' , ' \0 ' } ;
2006-09-12 19:38:39 +00:00
char c ;
2006-09-25 18:29:02 +00:00
2006-10-10 23:51:49 +00:00
fmt [ 1 ] = token - > u . c ;
if ( snprintf ( & c , 1 , fmt ) ! = 1 ) {
2006-09-12 19:38:39 +00:00
token - > type = TOKEN_REGEX ;
2006-10-10 23:51:49 +00:00
token - > u . start = pat ;
}
2006-09-24 19:28:03 +00:00
else
2006-10-10 23:51:49 +00:00
token - > u . c = c ;
2006-09-12 19:38:39 +00:00
}
2006-09-25 18:29:02 +00:00
token - > len = 1 ;
2006-09-12 19:38:39 +00:00
break ;
case ' | ' :
token - > type = TOKEN_ALT ;
break ;
case ' * ' :
case ' + ' :
case ' ? ' :
case ' { ' :
case ' } ' :
token - > type = TOKEN_REGEX ;
break ;
case ' [ ' :
{
/*TODO: implement*/
/*see if it is something simple like a list of characters, a range, or negated ...*/
const unsigned char * old = pat + + ; /* save this in case we change our mind and decide this is too complicated for us to handle*/
unsigned char range_start = 0 ;
int hasprev = 0 ;
char_bitmap_p bitmap = cli_malloc ( 32 ) ;
if ( ! bitmap )
return NULL ;
if ( * pat = = ' ^ ' ) {
memset ( bitmap , 0xFF , 32 ) ; /*match chars not in brackets*/
pat + + ;
}
else
memset ( bitmap , 0x00 , 32 ) ;
do {
/* literal ] can be first character, so test for it at the end of the loop, for example: []] */
if ( * pat = = ' - ' & & hasprev ) {
/* it is a range*/
unsigned char range_end ;
unsigned int c ;
2006-10-10 23:51:49 +00:00
massert ( range_start ) ;
2006-09-12 19:38:39 +00:00
pat + + ;
if ( pat [ 0 ] = = ' [ ' )
if ( pat [ 1 ] = = ' . ' ) {
if ( pat [ 2 ] = = ' - ' & & pat [ 3 ] = = ' . ' & & pat [ 4 ] = = ' ] ' )
range_end = ' - ' ;
else {
/* this is getting complicated, bail out */
cli_warnmsg ( " confused about collating sequences in regex,bailing out " ) ;
pat = old ;
token - > type = TOKEN_REGEX ;
break ;
}
}
else
range_end = * pat ;
else
range_end = * pat ;
for ( c = range_start + 1 ; c < = range_end ; c + + )
bitmap [ c > > 3 ] ^ = 1 < < ( c & 0x7 ) ;
hasprev = 0 ;
}
else if ( pat [ 0 ] = = ' [ ' & & pat [ 1 ] = = ' : ' ) {
const unsigned char * end ;
int len , found = - 1 ;
size_t i ;
pat + = 2 ;
end = ( unsigned char * ) strstr ( ( const char * ) pat , " :] " ) ;
if ( ! end ) {
cli_warnmsg ( " confused about std char class syntax regex,bailing out " ) ;
pat = old ;
token - > type = TOKEN_REGEX ;
break ;
}
len = end - pat ;
for ( i = 0 ; i < std_class_cnt ; i + + )
2006-10-10 23:51:49 +00:00
if ( ! strncmp ( ( const char * ) pat , std_class [ i ] , len ) ) {
2006-09-12 19:38:39 +00:00
found = i ;
break ;
}
if ( found ! = - 1 ) {
for ( i = 0 ; i < 256 ; i + + )
if ( char_class [ i ] & ( 1 < < found ) )
bitmap [ i > > 3 ] ^ = 1 < < ( i & 0x7 ) ;
}
else {
/*unknown class*/
cli_warnmsg ( " confused about regex bracket expression, bailing out " ) ;
pat = old ;
token - > type = TOKEN_REGEX ;
break ;
}
}
else {
bitmap [ * pat > > 3 ] ^ = 1 < < ( * pat & 0x7 ) ;
pat + + ;
range_start = * pat ;
hasprev = 1 ;
}
} while ( * pat ! = ' ] ' ) ;
/*TODO: see if this bitmap already exists, then reuse*/
token - > type = TOKEN_BRACKET ;
token - > u . bitmap = bitmap ;
break ;
}
case ' ] ' :
2006-10-10 23:51:49 +00:00
massert ( 0 & & " Encountered ] without matching [ " ) ;
2006-09-12 19:38:39 +00:00
/* bad state */
break ;
case ' . ' :
token - > type = TOKEN_DOT ;
break ;
case ' ( ' :
token - > type = TOKEN_PAR_OPEN ;
break ;
case ' ) ' :
token - > type = TOKEN_PAR_CLOSE ;
break ;
default :
token - > type = TOKEN_CHAR ;
2006-10-10 23:51:49 +00:00
token - > u . c = * pat ;
2006-09-12 19:38:39 +00:00
token - > len = 1 ;
break ;
}
return + + pat ;
}
/* Number of entries initially allocated for the position stack in find_regex_start(). */
# define INITIAL_ALT_STACK 10
/* Number of entries added to that stack each time it becomes full. */
# define ALT_STACK_GROW 20
static const unsigned char * find_regex_start ( const unsigned char * pat )
{
struct token_t token ;
/*TODO: find where the regex part begins, for ex:
* abcd + , regex begins at ' d '
* */
const unsigned char * last = NULL ;
const unsigned char * tmp = NULL ;
const unsigned char * * altpositions = cli_malloc ( INITIAL_ALT_STACK * sizeof ( * altpositions ) ) ;
size_t altpositions_capacity = INITIAL_ALT_STACK ;
size_t altpositions_cnt = 0 ;
char lasttype = - 1 ;
if ( ! altpositions )
return NULL ;
2006-10-10 23:51:49 +00:00
massert ( pat ) ;
2006-09-12 19:38:39 +00:00
/* Try to parse pattern till special regex chars are encountered, that the tree-matcher doesn't handle, like: +,*,{}.
* The tricky part is that once we encounter these , the previous ' atom ' has to be passed on to the regex matcher , so we have to
* back up to the last known good position
* Example , if we have : abc ( defg ) + , then only abc can be handled by tree parser , so we have to return the position of ( .
* Another example : abc ( defg | xyz | oz + | pdo ) , the last known good position is | , after xyz
* TODO : what about open parantheses ? maybe once we found a special char , we have top back out before the first ( ?
* */
do {
tmp = pat ;
pat = getNextToken ( pat , & token ) ;
if ( token . type ! = TOKEN_REGEX ) {
last = tmp ;
lasttype = token . type ;
2006-10-14 23:52:02 +00:00
if ( token . type = = TOKEN_BRACKET & & token . u . bitmap )
2006-09-12 19:38:39 +00:00
free ( token . u . bitmap ) ;
if ( token . type = = TOKEN_ALT | | token . type = = TOKEN_PAR_OPEN ) {
/* save this position on stack, succesfully parsed till here*/
if ( altpositions_cnt & & altpositions [ altpositions_cnt - 1 ] [ 0 ] = = ' | ' )
/* encountered another alternate (|) operator, override previous | position stored */
altpositions [ altpositions_cnt - 1 ] = last ;
else {
altpositions [ altpositions_cnt + + ] = last ;
if ( altpositions_cnt = = altpositions_capacity ) {
altpositions_capacity + = ALT_STACK_GROW ;
2007-05-25 23:10:58 +00:00
altpositions = cli_realloc2 ( altpositions , altpositions_capacity * sizeof ( * altpositions ) ) ;
2006-09-12 19:38:39 +00:00
if ( ! altpositions )
return NULL ;
}
}
} else if ( lasttype = = TOKEN_PAR_CLOSE ) {
/* remove last stored position from stack, succesfully this last group */
altpositions_cnt - - ;
2006-10-10 23:51:49 +00:00
massert ( altpositions_cnt > 0 ) ;
2006-09-12 19:38:39 +00:00
}
}
else {
if ( altpositions_cnt )
last = altpositions [ 0 /*altpositions_cnt-1*/ ] ; /*TODO: which index here?, see above TODO... */
/*last stored 'safe' position where no special (+,*,{}) regex chars were encountered*/
}
} while ( * pat & & token . type ! = TOKEN_REGEX ) ;
free ( altpositions ) ;
return * pat ? last : last + 1 ;
}
static struct tree_node * tree_node_alloc ( struct tree_node * next , char listend )
{
struct tree_node * node = cli_malloc ( sizeof ( * node ) ) ;
if ( node ) {
node - > alternatives = 0 ;
node - > next = next ;
node - > listend = listend ;
node - > u . children = NULL ;
}
return node ;
}
static struct tree_node * tree_root_alloc ( void )
{
struct tree_node * root = tree_node_alloc ( NULL , 1 ) ;
if ( root ) {
root - > op = OP_ROOT ;
root - > c = 0 ;
root - > next = NULL ;
root - > listend = 1 ;
}
return root ;
}
2006-10-10 23:51:49 +00:00
2007-03-11 11:14:35 +00:00
static struct tree_node * tree_node_char_binsearch ( const struct tree_node * node , const char csearch , int * left )
2006-09-12 19:38:39 +00:00
{
int right ;
struct tree_node * * children ;
2006-10-10 23:51:49 +00:00
massert ( node ) ;
massert ( left ) ;
2006-09-12 19:38:39 +00:00
children = tree_node_get_children ( node ) ;
right = node - > alternatives - 1 ;
* left = 0 ;
if ( ! node - > alternatives )
return NULL ;
2006-10-10 23:51:49 +00:00
massert ( children ) ;
2006-09-12 19:38:39 +00:00
while ( * left < = right ) {
int mid = * left + ( right - * left ) / 2 ;
if ( children [ mid ] - > c = = csearch )
return children [ mid ] ;
else if ( children [ mid ] - > c < csearch )
* left = mid + 1 ;
else
right = mid - 1 ;
}
return NULL ;
}
2007-03-11 11:14:35 +00:00
static struct tree_node * tree_get_next ( struct tree_node * node )
2006-09-12 19:38:39 +00:00
{
struct tree_node * * children ;
2006-10-10 23:51:49 +00:00
massert ( node ) ;
2006-09-12 19:38:39 +00:00
children = tree_node_get_children ( node ) ;
if ( ! node - > alternatives & & children & & children [ 0 ] )
return children [ 0 ] ;
else if ( node - > alternatives < = 1 )
return node ;
else
return children [ 0 ] - > next ;
}
2007-03-11 11:14:35 +00:00
static size_t tree_node_get_array_size ( const struct tree_node * node )
2006-09-12 19:38:39 +00:00
{
2006-10-10 23:51:49 +00:00
massert ( node ) ;
2006-09-12 19:38:39 +00:00
/* if op is CUSTOMCLASS, then first pointer is pointer to bitmap, so array size is +1 */
return ( node - > alternatives + ( node - > op = = OP_CUSTOMCLASS ? 1 : 0 ) ) * sizeof ( node - > u . children [ 0 ] ) ;
}
2007-03-11 11:14:35 +00:00
static struct tree_node * tree_node_char_insert ( struct tree_node * node , const char c , int left )
2006-09-12 19:38:39 +00:00
{
struct tree_node * new , * alt = tree_get_next ( node ) ;
2006-10-10 23:51:49 +00:00
struct tree_node * * children ;
2006-09-12 19:38:39 +00:00
node - > alternatives + + ;
2007-05-25 23:10:58 +00:00
node - > u . children = cli_realloc2 ( node - > u . children , tree_node_get_array_size ( node ) ) ;
2006-09-12 19:38:39 +00:00
if ( ! node - > u . children )
return NULL ;
2006-10-10 23:51:49 +00:00
children = node - > op = = OP_CUSTOMCLASS ? node - > u . children + 1 : node - > u . children ;
2006-09-12 19:38:39 +00:00
new = tree_node_alloc ( alt , node = = alt ) ;
if ( new ) {
new - > op = OP_CHAR ;
new - > c = c ;
}
if ( node - > alternatives - left - 1 > 0 )
2006-10-10 23:51:49 +00:00
memmove ( & children [ left + 1 ] , & children [ left ] , ( node - > alternatives - left - 1 ) * sizeof ( node - > u . children [ 0 ] ) ) ;
children [ left ] = new ;
2006-09-12 19:38:39 +00:00
return new ;
}
2007-03-11 11:14:35 +00:00
static void tree_node_insert_nonbin ( struct tree_node * node , struct tree_node * new )
2006-09-12 19:38:39 +00:00
{
struct tree_node * * children ;
2006-10-10 23:51:49 +00:00
massert ( node ) ;
massert ( new ) ;
2006-09-12 19:38:39 +00:00
children = tree_node_get_children ( node ) ;
if ( node - > alternatives ) {
2006-10-10 23:51:49 +00:00
massert ( children ) ;
2006-09-12 19:38:39 +00:00
if ( children [ 0 ] - > next = = node ) {
int i ;
new - > listend = 1 ;
for ( i = 0 ; i < node - > alternatives ; i + + ) {
children [ i ] - > next = new ;
children [ i ] - > listend = 0 ;
}
}
else {
struct tree_node * p ;
for ( p = children [ 0 ] - > next ; p - > next ! = node ; p = p - > next )
2006-10-10 23:51:49 +00:00
massert ( ! p - > listend ) ;
2006-09-12 19:38:39 +00:00
new - > listend = 1 ;
p - > listend = 0 ;
p - > next = new ;
}
}
else {
2006-09-24 19:28:03 +00:00
int idx = node - > op = = OP_CUSTOMCLASS ? 1 : 0 ;
2006-09-12 19:38:39 +00:00
if ( node - > u . children )
2006-09-24 19:28:03 +00:00
if ( node - > u . children [ idx ] ) {
node = node - > u . children [ idx ] ;
while ( node - > next & & ! node - > listend )
node = node - > next ;
node - > listend = 0 ;
2007-12-06 15:29:54 +00:00
new - > next = node - > next ;
2006-09-24 19:28:03 +00:00
node - > next = new ;
new - > listend = 1 ;
2007-02-17 19:16:19 +00:00
return ;
2006-09-24 19:28:03 +00:00
}
2007-05-25 23:10:58 +00:00
node - > u . children = cli_realloc2 ( node - > u . children , sizeof ( node - > u . children [ 0 ] ) * ( 2 ) ) ;
2006-09-24 19:28:03 +00:00
if ( node - > u . children ) {
node - > u . children [ idx ] = new ;
}
2006-09-12 19:38:39 +00:00
}
}
2007-03-11 11:14:35 +00:00
static unsigned char char_getclass ( const unsigned char * bitmap )
2006-09-12 19:38:39 +00:00
{
size_t i ;
2006-10-10 23:51:49 +00:00
massert ( bitmap ) ;
2006-09-12 19:38:39 +00:00
for ( i = 0 ; i < std_class_cnt ; i + + )
if ( ! memcmp ( bitmap , char_class_bitmap [ i ] , 256 > > 3 ) )
return i ;
return std_class_cnt ;
}
static void stack_destroy ( struct node_stack * stack )
{
2006-10-10 23:51:49 +00:00
massert ( stack ) ;
2006-09-12 19:38:39 +00:00
if ( stack - > data )
free ( stack - > data ) ;
stack - > data = NULL ;
stack - > capacity = 0 ;
}
/* call this after whitelist load is complete, and the tree is no longer going to be modified */
void regex_list_cleanup ( struct regex_matcher * matcher )
{
2006-10-10 23:51:49 +00:00
massert ( matcher ) ;
2006-09-12 19:38:39 +00:00
stack_destroy ( & matcher - > node_stack ) ;
stack_destroy ( & matcher - > node_stack_alt ) ;
stack_init ( & matcher - > node_stack ) ;
stack_init ( & matcher - > node_stack_alt ) ;
}
int is_regex_ok ( struct regex_matcher * matcher )
{
2006-10-10 23:51:49 +00:00
massert ( matcher ) ;
2006-09-12 19:38:39 +00:00
return ( ! matcher - > list_inited | | matcher - > list_inited ! = - 1 ) ; /* either we don't have a regexlist, or we initialized it successfully */
}
/* returns 0 on success, regexec error code otherwise */
2007-03-18 23:27:15 +00:00
static int add_pattern ( struct regex_matcher * matcher , const unsigned char * pat , const char * info , int hostonly )
2006-09-12 19:38:39 +00:00
{
int bol = 1 ;
const unsigned char * pat_end = find_regex_start ( pat ) ;
struct token_t token ;
struct tree_node * node ;
2006-10-10 23:51:49 +00:00
massert ( matcher ) ;
2006-09-12 19:38:39 +00:00
2007-03-18 23:27:15 +00:00
node = hostonly ? matcher - > root_regex_hostonly : matcher - > root_regex ;
2006-09-12 19:38:39 +00:00
stack_reset ( & matcher - > node_stack ) ;
stack_reset ( & matcher - > node_stack_alt ) ;
stack_push ( & matcher - > node_stack , node ) ;
for ( ; node - > op ! = OP_LEAF ; ) {
if ( pat < pat_end )
pat = getNextToken ( pat , & token ) ;
else if ( * pat ) {
token . type = TOKEN_REGEX ;
token . u . start = pat ;
}
else
token . type = TOKEN_DONE ;
switch ( token . type ) {
case TOKEN_CHAR :
{
/* search for char in tree */
int left ;
2006-10-10 23:51:49 +00:00
struct tree_node * newnode = tree_node_char_binsearch ( node , token . u . c , & left ) ;
2006-09-12 19:38:39 +00:00
if ( newnode )
node = newnode ;
else {
/* not found, insert it */
2006-10-10 23:51:49 +00:00
node = tree_node_char_insert ( node , token . u . c , left ) ;
2006-09-12 19:38:39 +00:00
}
break ;
}
case TOKEN_PAR_OPEN :
stack_push ( & matcher - > node_stack_alt , NULL ) ; /* marker */
stack_push ( & matcher - > node_stack , node ) ;
break ;
case TOKEN_PAR_CLOSE : {
/*TODO: test this!!!*/
struct tree_node * node_alt = node ;
node = tree_node_alloc ( NULL , 1 ) ;
node - > op = OP_PARCLOSE ;
node - > c = 0 ;
node - > listend = 1 ;
tree_node_insert_nonbin ( node_alt , node ) ;
while ( ( node_alt = stack_pop ( & matcher - > node_stack_alt ) ) ) {
tree_node_insert_nonbin ( node_alt , node ) ;
}
stack_pop ( & matcher - > node_stack ) ;
break ;
}
case TOKEN_ALT :
stack_push ( & matcher - > node_stack_alt , node ) ;
node = stack_pop ( & matcher - > node_stack ) ;
stack_push ( & matcher - > node_stack , node ) ;
break ;
case TOKEN_BRACKET :
{
struct tree_node * new = tree_node_alloc ( tree_get_next ( node ) , 1 ) ;
2006-10-10 23:51:49 +00:00
unsigned char charclass = char_getclass ( token . u . bitmap ) ;
2006-09-12 19:38:39 +00:00
if ( charclass = = std_class_cnt ) { /*not a std char class*/
new - > op = OP_CUSTOMCLASS ;
new - > u . children = cli_malloc ( sizeof ( new - > u . children [ 0 ] ) * 2 ) ;
2006-10-14 23:52:02 +00:00
if ( ! new - > u . children )
return CL_EMEM ;
2006-09-12 19:38:39 +00:00
new - > u . bitmap [ 0 ] = token . u . bitmap ;
new - > u . bitmap [ 1 ] = NULL ;
tree_node_insert_nonbin ( node , new ) ;
node = new ;
}
else {
new - > op = OP_STDCLASS ;
new - > c = charclass ;
tree_node_insert_nonbin ( node , new ) ;
node = new ;
}
break ;
}
case TOKEN_DOT :
{
struct tree_node * new = tree_node_alloc ( tree_get_next ( node ) , 1 ) ;
new - > op = OP_DOT ;
tree_node_insert_nonbin ( node , new ) ;
node = new ;
break ;
}
case TOKEN_REGEX :
case TOKEN_DONE : {
struct leaf_info * leaf = cli_malloc ( sizeof ( * leaf ) ) ;
2006-10-14 23:52:02 +00:00
if ( ! leaf )
return CL_EMEM ;
2007-03-11 11:14:35 +00:00
leaf - > info = cli_strdup ( info ) ;
2006-09-12 19:38:39 +00:00
if ( token . type = = TOKEN_REGEX ) {
int rc ;
struct tree_node * new ;
regex_t * preg ;
preg = cli_malloc ( sizeof ( * preg ) ) ;
2006-10-14 23:52:02 +00:00
if ( ! preg )
return CL_EMEM ;
2007-09-17 18:54:56 +00:00
rc = cli_regcomp ( preg , ( const char * ) token . u . start , REG_EXTENDED | ( bol ? 0 : REG_NOTBOL ) ) ;
2006-09-12 19:38:39 +00:00
leaf - > preg = preg ;
if ( rc )
return rc ;
new = cli_malloc ( sizeof ( * new ) ) ;
2006-10-14 23:52:02 +00:00
if ( ! new )
return CL_EMEM ;
2006-09-12 19:38:39 +00:00
new - > op = OP_LEAF ;
new - > next = node ;
new - > alternatives = 0 ;
new - > u . leaf = leaf ;
new - > listend = 1 ;
tree_node_insert_nonbin ( node , new ) ;
}
else {
leaf - > preg = NULL ;
node - > alternatives = 0 ;
node - > u . leaf = leaf ;
node - > op = OP_LEAF ;
}
return 0 ;
}
}
bol = 0 ;
}
return 0 ;
}
/* c has to be unsigned char here!! */
static int match_node ( struct tree_node * node , const unsigned char * c , size_t len , const char * * info )
{
struct tree_node * * children ;
int rc ;
2006-10-10 23:51:49 +00:00
massert ( node ) ;
massert ( c ) ;
massert ( info ) ;
2006-09-12 19:38:39 +00:00
2006-09-16 15:49:27 +00:00
if ( ! node - > u . children )
return MATCH_FAILED ; /* tree empty */
2006-09-12 19:38:39 +00:00
* info = NULL ;
len + + ;
c - - ;
for ( ; ; ) {
2006-10-10 23:51:49 +00:00
massert ( node ) ;
2006-09-12 19:38:39 +00:00
children = node - > u . children ;
switch ( node - > op ) {
case OP_ROOT :
rc = 1 ;
break ;
case OP_PARCLOSE :
/*this isn't a real character, so don't move*/
c - - ;
len + + ;
rc = 1 ;
break ;
case OP_CHAR :
2006-10-10 23:51:49 +00:00
massert ( * c = = node - > c & & " We know this has to match " ) ;
2006-09-12 19:38:39 +00:00
rc = 1 ; /* *c==node->c;- we know it has matched */
break ;
case OP_DOT :
rc = * c ! = ' \n ' ;
break ;
case OP_STDCLASS :
rc = char_class [ * c ] & ( node - > c ) ;
break ;
case OP_CUSTOMCLASS :
{
char_bitmap_p bitmap ;
2006-10-10 23:51:49 +00:00
massert ( children ) ;
2006-09-12 19:38:39 +00:00
bitmap = ( char_bitmap_p ) node - > u . bitmap [ 0 ] ;
children + + ;
rc = bitmap [ * c > > 3 ] & ( 1 < < ( * c & 0x7 ) ) ;
break ;
}
case OP_LEAF :
{
const struct leaf_info * leaf = node - > u . leaf ;
/*isleaf = 1;*/
if ( leaf - > preg ) {
2007-09-17 18:54:56 +00:00
rc = ! cli_regexec ( leaf - > preg , ( const char * ) c , 0 , NULL , 0 ) ;
2006-09-12 19:38:39 +00:00
}
else {
2006-10-10 23:51:49 +00:00
massert ( * c = = node - > c & & " We know this has to match[2] " ) ;
2006-09-12 19:38:39 +00:00
rc = 1 ;
}
if ( rc ) {
* info = leaf - > info ;
return MATCH_SUCCESS ;
}
break ;
}
default :
/* impossible */
cli_errmsg ( " Encountered invalid operator in tree:%d \n " , node - > op ) ;
exit ( 1 ) ;
}
len - - ;
if ( ! len ) rc = 0 ;
c + + ;
if ( rc ) {
const char csearch = * c ;
int left = 0 , right = node - > alternatives - 1 ;
int mid ;
/*matched so far, go deeper*/
/*do a binary search between children */
2006-10-10 23:51:49 +00:00
massert ( children ) ;
2006-09-12 19:38:39 +00:00
while ( left < = right ) {
mid = left + ( right - left ) / 2 ;
if ( children [ mid ] - > c = = csearch )
break ;
else if ( children [ mid ] - > c < csearch )
left = mid + 1 ;
else
right = mid - 1 ;
}
if ( left < = right ) {
node = children [ mid ] ;
2006-10-10 23:51:49 +00:00
massert ( node ) ;
2006-09-12 19:38:39 +00:00
}
else {
if ( node - > alternatives ) {
if ( ! children [ 0 ] - > listend ) {
node = children [ 0 ] ;
c + + ;
len - - ;
}
while ( node & & node - > listend ) {
node = node - > next ; /* climb up */
c - - ;
len + + ;
}
if ( ! node | | ! node - > next )
return MATCH_FAILED ; /* reached root node */
node = node - > next ;
c - - ;
len + + ;
}
else if ( node - > u . children ) {
struct tree_node * rewrite_next = NULL ;
if ( node - > op = = OP_PARCLOSE )
rewrite_next = node ;
node = children [ 0 ] ;
2006-10-10 23:51:49 +00:00
massert ( node ) ;
massert ( node - > op ! = OP_CHAR ) ;
2006-09-12 19:38:39 +00:00
if ( rewrite_next )
node - > next = rewrite_next ; /* this node is pointed to by several parent nodes,
we need to know
from which one we came , so we can find out way back
should we fail to match somewhere deeper */
}
}
}
else {
/* this node didn't match, try sibling, or parent (if no more siblings) */
while ( node & & node - > listend ) {
node = node - > next ; /* sibling of parent */
c - - ;
len + + ;
}
if ( ! node | | ! node - > next ) /* reached root node, it has no next */
return MATCH_FAILED ;
2006-09-24 19:28:03 +00:00
else {
c - - ;
len + + ;
node = node - > next ;
}
2006-09-12 19:38:39 +00:00
}
}
return MATCH_FAILED ;
}
/* push node on stack, only if it isn't there already */
2007-03-11 11:14:35 +00:00
static void stack_push_once ( struct node_stack * stack , struct tree_node * node )
2006-09-12 19:38:39 +00:00
{
size_t i ;
2006-10-10 23:51:49 +00:00
massert ( stack ) ;
massert ( node ) ;
2006-09-12 19:38:39 +00:00
for ( i = 0 ; i < stack - > cnt ; i + + )
if ( stack - > data [ i ] = = node )
return ;
stack_push ( stack , node ) ;
}
static void destroy_tree_internal ( struct regex_matcher * matcher , struct tree_node * node )
{
struct tree_node * * children ;
2006-10-10 23:51:49 +00:00
massert ( matcher ) ;
massert ( node ) ;
2006-09-12 19:38:39 +00:00
children = tree_node_get_children ( node ) ;
if ( node - > op = = OP_LEAF ) {
struct leaf_info * leaf = node - > u . leaf ;
if ( node - > next & & ! node - > listend )
destroy_tree_internal ( matcher , node - > next ) ;
stack_push_once ( & matcher - > node_stack , ( struct tree_node * ) node - > u . leaf ) ; /* cast to make compiler happy, and to not make another stack implementation for storing void* */
stack_push_once ( & matcher - > node_stack , node ) ;
if ( leaf - > preg ) {
2007-09-17 18:54:56 +00:00
cli_regfree ( leaf - > preg ) ;
2006-09-12 19:38:39 +00:00
free ( leaf - > preg ) ;
leaf - > preg = NULL ;
}
if ( leaf - > info ) {
free ( leaf - > info ) ;
leaf - > info = NULL ;
}
/* return;*/
}
if ( node - > alternatives ) {
int i ;
struct tree_node * p ;
2006-10-10 23:51:49 +00:00
massert ( children ) ;
2006-09-12 19:38:39 +00:00
p = children [ 0 ] - > op = = OP_LEAF ? NULL : children [ 0 ] - > next ;
for ( i = 0 ; i < node - > alternatives ; i + + )
destroy_tree_internal ( matcher , children [ i ] ) ;
if ( p & & p ! = node )
destroy_tree_internal ( matcher , p ) ; /*?? is this ok, or without _internal?*/
}
else {
if ( children ) {
if ( children [ 0 ] )
destroy_tree_internal ( matcher , children [ 0 ] ) ;
}
}
2006-09-24 19:28:03 +00:00
if ( node - > op ! = OP_LEAF & & node - > next & & ! node - > listend )
2006-09-12 19:38:39 +00:00
destroy_tree_internal ( matcher , node - > next ) ;
if ( node - > u . children )
stack_push_once ( & matcher - > node_stack , ( struct tree_node * ) node - > u . children ) ; /* cast to make compiler happy, it isn't really a tree_node* */
if ( node - > op = = OP_CUSTOMCLASS & & node - > u . children [ 0 ] ) {
free ( node - > u . children [ 0 ] ) ;
node - > u . children [ 0 ] = NULL ;
}
stack_push_once ( & matcher - > node_stack , node ) ;
}
static void destroy_tree ( struct regex_matcher * matcher )
{
/* we might have the same node linked by different nodes, so a recursive walk&free doesn't work in all situations,
* i . e . it might double - free , so instead of freeing , just push the nodes on a stack , and later free the nodes in that stack ,
* ( and push to stack only if it doesn ' t contain it already */
2006-10-10 23:51:49 +00:00
massert ( matcher ) ;
2006-09-12 19:38:39 +00:00
stack_reset ( & matcher - > node_stack ) ;
destroy_tree_internal ( matcher , matcher - > root_regex ) ;
2007-03-18 23:27:15 +00:00
destroy_tree_internal ( matcher , matcher - > root_regex_hostonly ) ;
2006-09-12 19:38:39 +00:00
while ( matcher - > node_stack . cnt ) {
struct tree_node * node = stack_pop ( & matcher - > node_stack ) ;
2006-10-14 23:52:02 +00:00
if ( node )
free ( node ) ;
2006-09-12 19:38:39 +00:00
}
}
# ifndef NDEBUG
static void dump_node ( struct tree_node * node )
{
int i ;
struct tree_node * p , * * children ;
2006-10-10 23:51:49 +00:00
massert ( node ) ;
2006-09-12 19:38:39 +00:00
if ( node - > op = = OP_LEAF ) {
if ( node - > u . leaf - > preg )
printf ( " n%p [label= \" regex \\ nleaf \" ] " , ( void * ) node ) ;
else
printf ( " n%p [label= \" %c \\ nleaf \" ]; \n " , ( void * ) node , node - > c ) ;
if ( node - > next & & ! node - > listend ) {
printf ( " n%p -> n%p; \n " , ( void * ) node , ( void * ) node - > next ) ;
dump_node ( node - > next ) ;
}
return ;
}
printf ( " n%p [label= \" %c \\ n%d \\ nlistend:%d \" ]; \n " , ( void * ) node , ( node - > op = = OP_ROOT | | node - > op = = OP_PARCLOSE ) ? ' @ ' : node - > c , node - > op , node - > listend ) ;
if ( node - > next )
printf ( " n%p -> n%p; \n " , ( void * ) node , ( void * ) node - > next ) ;
printf ( " n%p -> { " , ( void * ) node ) ; /*using address of node as id*/
children = tree_node_get_children ( node ) ;
if ( node - > alternatives )
2006-10-10 23:51:49 +00:00
massert ( children ) ;
2006-09-12 19:38:39 +00:00
for ( i = 0 ; i < node - > alternatives ; i + + )
printf ( " n%p " , ( void * ) children [ i ] ) ;
if ( node - > alternatives & & children [ 0 ] - > op ! = OP_LEAF )
for ( p = children [ 0 ] - > next ; p ! = node ; p = p - > next )
{
2006-10-10 23:51:49 +00:00
massert ( p ) ;
2006-09-12 19:38:39 +00:00
printf ( " n%p " , ( void * ) p ) ;
if ( p - > op = = OP_LEAF | | p - > listend )
break ;
}
if ( ! node - > alternatives & & children & & children [ 0 ] )
printf ( " n%p " , ( void * ) children [ 0 ] ) ;
printf ( " }; \n " ) ;
printf ( " {rank=same; " ) ;
for ( i = 0 ; i < node - > alternatives ; i + + )
printf ( " n%p " , ( void * ) node - > u . children [ i ] ) ;
if ( node - > alternatives & & children [ 0 ] - > op ! = OP_LEAF )
for ( p = children [ 0 ] - > next ; p ! = node ; p = p - > next )
{
printf ( " n%p " , ( void * ) p ) ;
if ( p - > op = = OP_LEAF | | p - > listend )
break ;
}
if ( ! node - > alternatives & & children & & children [ 0 ] )
printf ( " n%p " , ( void * ) children [ 0 ] ) ;
printf ( " }; \n " ) ;
for ( i = 0 ; i < node - > alternatives ; i + + )
dump_node ( children [ i ] ) ;
if ( node - > alternatives & & children [ 0 ] - > op ! = OP_LEAF )
for ( p = children [ 0 ] - > next ; p ! = node ; p = p - > next )
{
dump_node ( p ) ;
if ( p - > op = = OP_LEAF | | p - > listend )
break ;
}
if ( ! node - > alternatives & & children & & children [ 0 ] )
dump_node ( children [ 0 ] ) ;
}
/*
 * Print the tree below `root` as a graphviz digraph on stdout;
 * use dot/dotty from graphviz to view it.
 */
void dump_tree(struct tree_node *root)
{
    massert(root);

    fputs(" digraph tree { \n ", stdout);
    dump_node(root);
    fputs(" } \n ", stdout);
}
# endif