uniq update

git-svn: trunk@4069
This commit is contained in:
aCaB 2008-08-03 21:39:29 +00:00
parent 937ade082c
commit 53ed2cb75c
5 changed files with 47 additions and 152 deletions

View file

@ -1,3 +1,8 @@
Sun Aug 3 23:09:44 CEST 2008 (acab)
------------------------------------
* libclamav/uniq: faster md5 lookup
* sigtool: sync
Sun Aug 3 16:12:17 CEST 2008 (acab) Sun Aug 3 16:12:17 CEST 2008 (acab)
------------------------------------ ------------------------------------
* libclamav: use md5 based lookup for ole2/vba instead of hashtab (bb#1071) * libclamav: use md5 based lookup for ole2/vba instead of hashtab (bb#1071)

View file

@ -24,118 +24,21 @@
#include "clamav-config.h" #include "clamav-config.h"
#endif #endif
#include <stdlib.h>
#if HAVE_STRING_H
#include <string.h>
#endif
#include "uniq.h" #include "uniq.h"
#include "others.h"
#include "md5.h" #include "md5.h"
#if 0
struct uniq *uniq_init(uint32_t count) { struct uniq *uniq_init(uint32_t count) {
struct uniq *U; struct uniq *U;
uint32_t i;
if(!count) return NULL; if(!count) return NULL;
U = cli_calloc(1, sizeof(*U)); U = cli_calloc(1, sizeof(*U));
if(!U) return NULL; if(!U) return NULL;
if(cli_ac_init(&U->matcher, 16, 16)) {
uniq_free(U);
return NULL;
}
U->custs = cli_calloc(count, sizeof(U->custs));
if(!U->custs) {
uniq_free(U);
return NULL;
}
U->patts = cli_calloc(count, sizeof(U->patts));
if(!U->patts) {
uniq_free(U);
return NULL;
}
U->md5s = cli_malloc(count*sizeof(U->md5s));
if(!U->md5s) {
uniq_free(U);
return NULL;
}
U->entries = count;
for(i=0; i<count; i++) {
U->patts[i].pattern = U->md5s[i].md5;
U->patts[i].length = 16;
U->patts[i].ch[0] = U->patts[i].ch[1] |= CLI_MATCH_IGNORE;
U->patts[i].customdata = &U->custs[i];
}
return U;
}
void uniq_free(struct uniq *U) {
uint32_t i;
U->matcher.ac_patterns = 0; /* don't free my arrays! */
cli_ac_free(&U->matcher);
if(U->custs) free(U->custs);
if(U->patts) free(U->patts);
if(U->md5s) free(U->md5s);
free(U);
}
uint32_t uniq_add(struct uniq *U, const char *key, uint32_t key_len, char **rhash) {
uint8_t digest[16];
struct UNIQCUST *cust;
struct cli_ac_data mdata;
cli_md5_ctx md5;
cli_md5_init(&md5);
cli_md5_update(&md5, key, key_len);
cli_md5_final(digest, &md5);
cli_ac_initdata(&mdata, 0, 0, AC_DEFAULT_TRACKLEN); /* This can't fail as we don't have parts or lsigs */
if (cli_ac_scanbuff(digest,16, NULL, (void *)&cust, NULL, &U->matcher, &mdata,0,0,-1,NULL,AC_SCAN_VIR,NULL)!=CL_VIRUS) {
int i;
char HEX[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
struct cli_ac_patt *patt = &U->patts[U->matcher.ac_patterns];
cust = patt->customdata;
for(i = 0; i < 16; i++) {
cust->name[i*2] = HEX[digest[i]>>4 & 0xf];
cust->name[i*2+1] = HEX[digest[i] & 0xf];
patt->pattern[i] = digest[i];
}
cli_ac_addpatt(&U->matcher,patt); /* FIXME this can fail */
cli_ac_buildtrie(&U->matcher);
}
cust->count++;
if(rhash) *rhash = cust->name;
return cust->count;
}
uint32_t uniq_get(struct uniq *U, const char *key, uint32_t key_len, char **rhash) {
uint8_t digest[16];
struct UNIQCUST *cust;
struct cli_ac_data mdata;
cli_md5_ctx md5;
cli_md5_init(&md5);
cli_md5_update(&md5, key, key_len);
cli_md5_final(digest, &md5);
cli_ac_initdata(&mdata, 0, 0, AC_DEFAULT_TRACKLEN); /* This can't fail as we don't have parts or lsigs */
if (cli_ac_scanbuff(digest,16, NULL, (void *)&cust, NULL, &U->matcher, &mdata,0,0,-1,NULL,AC_SCAN_VIR,NULL)!=CL_VIRUS)
return 0;
if(rhash) *rhash = cust->name;
return cust->count;
}
#else
#include <string.h>
struct uniq *uniq_init(uint32_t count) {
struct uniq *U;
if(!count) return NULL;
U = cli_malloc(sizeof(*U));
if(!U) return NULL;
U->md5s = cli_malloc(count * sizeof(*U->md5s)); U->md5s = cli_malloc(count * sizeof(*U->md5s));
if(!U->md5s) { if(!U->md5s) {
@ -143,7 +46,6 @@ struct uniq *uniq_init(uint32_t count) {
return NULL; return NULL;
} }
U->items = 0;
return U; return U;
} }
@ -156,50 +58,59 @@ uint32_t uniq_add(struct uniq *U, const char *key, uint32_t key_len, char **rhas
unsigned int i; unsigned int i;
uint8_t digest[16]; uint8_t digest[16];
cli_md5_ctx md5; cli_md5_ctx md5;
struct UNIQMD5 *m; struct UNIQMD5 *m = NULL;
cli_md5_init(&md5); cli_md5_init(&md5);
cli_md5_update(&md5, key, key_len); cli_md5_update(&md5, key, key_len);
cli_md5_final(digest, &md5); cli_md5_final(digest, &md5);
for(i=0; i<U->items; i++) { if(U->items && U->md5s[U->idx[*digest]].md5[0]==*digest)
if(memcmp(digest, U->md5s[i].md5, 16)) continue; for(m=&U->md5s[U->idx[*digest]]; m; m=m->next)
m = &U->md5s[i]; if(!memcmp(&digest[1], &m->md5[1], 15)) break;
break;
}
if(i==U->items) { if(!m) {
char HEX[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; const char HEX[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
m = &U->md5s[i];
m = &U->md5s[U->items];
m->count = 0; m->count = 0;
if(U->items && U->md5s[U->idx[*digest]].md5[0]==*digest)
m->next = &U->md5s[U->idx[*digest]];
else
m->next = NULL;
U->idx[*digest]=U->items;
for(i = 0; i < 16; i++) { for(i = 0; i < 16; i++) {
m->name[i*2] = HEX[digest[i]>>4 & 0xf]; m->name[i*2] = HEX[digest[i]>>4 & 0xf];
m->name[i*2+1] = HEX[digest[i] & 0xf]; m->name[i*2+1] = HEX[digest[i] & 0xf];
m->md5[i] = digest[i]; m->md5[i] = digest[i];
} }
m->name[32] = '\0'; m->name[32] = '\0';
} }
U->items++; U->items++;
if(rhash) *rhash = m->name; if(rhash) *rhash = m->name;
return m->count++; return m->count++;
} }
uint32_t uniq_get(struct uniq *U, const char *key, uint32_t key_len, char **rhash) { uint32_t uniq_get(struct uniq *U, const char *key, uint32_t key_len, char **rhash) {
unsigned int i;
uint8_t digest[16]; uint8_t digest[16];
cli_md5_ctx md5; cli_md5_ctx md5;
struct UNIQMD5 *m = NULL;
cli_md5_init(&md5); cli_md5_init(&md5);
cli_md5_update(&md5, key, key_len); cli_md5_update(&md5, key, key_len);
cli_md5_final(digest, &md5); cli_md5_final(digest, &md5);
for(i=0; i<U->items; i++) { if(!U->items || U->md5s[U->idx[*digest]].md5[0]!=*digest)
if(memcmp(digest, U->md5s[i].md5, 16)) continue; return 0;
if(rhash) *rhash = U->md5s[i].name;
return U->md5s[i].count; for(m=&U->md5s[U->idx[*digest]]; m; m=m->next) {
if(memcmp(&digest[1], &m->md5[1], 15)) continue;
if(rhash) *rhash = m->name;
return m->count;
} }
return 0; return 0;
} }
#endif

View file

@ -23,46 +23,25 @@
#ifndef _UNIQ_H #ifndef _UNIQ_H
#define _UNIQ_H #define _UNIQ_H
#include "matcher.h"
#include "cltypes.h" #include "cltypes.h"
#if 0
struct UNIQCUST {
char name[33];
uint32_t count;
};
struct UNIQMD5 {
uint16_t md5[16];
};
/* A basic storage for unique IDs */
struct uniq {
struct cli_matcher matcher;
struct cli_ac_patt *patts;
struct UNIQMD5 *md5s;
struct UNIQCUST *custs;
uint32_t entries;
};
#else
struct UNIQMD5 { struct UNIQMD5 {
struct UNIQMD5 *next;
uint32_t count; uint32_t count;
uint8_t md5[16]; uint8_t md5[16];
char name[33]; char name[33];
}; };
struct uniq { struct uniq {
uint32_t items;
struct UNIQMD5 *md5s; struct UNIQMD5 *md5s;
uint32_t items;
uint32_t idx[256];
}; };
#endif
struct uniq *uniq_init(uint32_t); struct uniq *uniq_init(uint32_t);
void uniq_free(struct uniq *); void uniq_free(struct uniq *);
uint32_t uniq_add(struct uniq *, const char *, uint32_t, char **); uint32_t uniq_add(struct uniq *, const char *, uint32_t, char **);
uint32_t uniq_get(struct uniq *, const char *, uint32_t, char **); uint32_t uniq_get(struct uniq *, const char *, uint32_t, char **);
#endif #endif

View file

@ -1041,9 +1041,9 @@ int sigtool_vba_scandir (const char *dirname, int hex_output, struct uniq *U)
DIR *dd; DIR *dd;
struct dirent *dent; struct dirent *dent;
struct stat statbuf; struct stat statbuf;
char *fullname, vbaname[1024]; char *fullname, vbaname[1024], *hash;
unsigned char *data; unsigned char *data;
uint32_t hashcnt, hash; uint32_t hashcnt;
hashcnt = uniq_get(U, "_vba_project", 12, NULL); hashcnt = uniq_get(U, "_vba_project", 12, NULL);
while(hashcnt--) { while(hashcnt--) {
@ -1051,7 +1051,7 @@ int sigtool_vba_scandir (const char *dirname, int hex_output, struct uniq *U)
for(i = 0; i < vba_project->count; i++) { for(i = 0; i < vba_project->count; i++) {
for(j = 0; j < vba_project->colls[i]; j++) { for(j = 0; j < vba_project->colls[i]; j++) {
snprintf(vbaname, 1024, "%s/%u_%u", vba_project->dir, vba_project->name[i], j); snprintf(vbaname, 1024, "%s/%s_%u", vba_project->dir, vba_project->name[i], j);
vbaname[sizeof(vbaname)-1] = '\0'; vbaname[sizeof(vbaname)-1] = '\0';
fd = open(vbaname, O_RDONLY|O_BINARY); fd = open(vbaname, O_RDONLY|O_BINARY);
if(fd == -1) continue; if(fd == -1) continue;
@ -1077,7 +1077,7 @@ int sigtool_vba_scandir (const char *dirname, int hex_output, struct uniq *U)
if((hashcnt = uniq_get(U, "powerpoint document", 19, &hash))) { if((hashcnt = uniq_get(U, "powerpoint document", 19, &hash))) {
while(hashcnt--) { while(hashcnt--) {
snprintf(vbaname, 1024, "%s/%u_%u", dirname, hash, hashcnt); snprintf(vbaname, 1024, "%s/%s_%u", dirname, hash, hashcnt);
vbaname[sizeof(vbaname)-1] = '\0'; vbaname[sizeof(vbaname)-1] = '\0';
fd = open(vbaname, O_RDONLY|O_BINARY); fd = open(vbaname, O_RDONLY|O_BINARY);
if (fd == -1) continue; if (fd == -1) continue;
@ -1093,7 +1093,7 @@ int sigtool_vba_scandir (const char *dirname, int hex_output, struct uniq *U)
if ((hashcnt = uniq_get(U, "worddocument", 12, &hash))) { if ((hashcnt = uniq_get(U, "worddocument", 12, &hash))) {
while(hashcnt--) { while(hashcnt--) {
snprintf(vbaname, sizeof(vbaname), "%s/%u_%u", dirname, hash, hashcnt); snprintf(vbaname, sizeof(vbaname), "%s/%s_%u", dirname, hash, hashcnt);
vbaname[sizeof(vbaname)-1] = '\0'; vbaname[sizeof(vbaname)-1] = '\0';
fd = open(vbaname, O_RDONLY|O_BINARY); fd = open(vbaname, O_RDONLY|O_BINARY);
if (fd == -1) continue; if (fd == -1) continue;

View file

@ -20,7 +20,7 @@
#ifndef __VBA_H #ifndef __VBA_H
#define __VBA_H #define __VBA_H
#include "libclamav/hashtab.h" #include "libclamav/uniq.h"
int sigtool_vba_scandir(const char *dirname, int hex_output, struct uniq *U); int sigtool_vba_scandir(const char *dirname, int hex_output, struct uniq *U);
#endif #endif