2004-07-02 23:00:58 +00:00
/*
2008-04-02 15:24:51 +00:00
* Copyright ( C ) 2007 - 2008 Sourcefire , Inc .
2007-12-14 22:39:37 +00:00
*
2008-04-02 15:24:51 +00:00
* Authors : Tomasz Kojm
2004-07-02 23:00:58 +00:00
*
* This program is free software ; you can redistribute it and / or modify
2007-03-31 20:31:04 +00:00
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation .
2004-07-02 23:00:58 +00:00
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
*
* You should have received a copy of the GNU General Public License
* along with this program ; if not , write to the Free Software
2006-04-09 19:59:28 +00:00
* Foundation , Inc . , 51 Franklin Street , Fifth Floor , Boston ,
* MA 02110 - 1301 , USA .
2004-07-02 23:00:58 +00:00
*/
# if HAVE_CONFIG_H
# include "clamav-config.h"
# endif
# include <stdio.h>
# include <string.h>
# include <stdlib.h>
2006-12-20 15:33:55 +00:00
# include <sys/types.h>
2006-11-15 15:26:54 +00:00
# ifdef HAVE_UNISTD_H
# include <unistd.h>
# endif
2004-07-02 23:00:58 +00:00
# include "clamav.h"
# include "filetypes.h"
2004-07-19 17:54:40 +00:00
# include "others.h"
# include "readdb.h"
2005-11-11 01:31:46 +00:00
# include "matcher-ac.h"
2006-10-25 15:40:47 +00:00
# include "str.h"
2008-01-07 14:20:38 +00:00
# include "textdet.h"
2008-12-29 17:55:30 +00:00
# include "default.h"
2004-07-02 23:00:58 +00:00
2006-12-26 16:17:02 +00:00
# include "htmlnorm.h"
# include "entconv.h"
2008-10-21 23:55:11 +00:00
# include "mpool.h"
2006-12-26 16:17:02 +00:00
2007-12-14 22:39:37 +00:00
static const struct ftmap_s {
const char * name ;
cli_file_t code ;
} ftmap [ ] = {
2008-01-07 14:20:38 +00:00
{ " CL_TYPE_TEXT_ASCII " , CL_TYPE_TEXT_ASCII } ,
{ " CL_TYPE_TEXT_UTF8 " , CL_TYPE_TEXT_UTF8 } ,
{ " CL_TYPE_TEXT_UTF16LE " , CL_TYPE_TEXT_UTF16LE } ,
{ " CL_TYPE_TEXT_UTF16BE " , CL_TYPE_TEXT_UTF16BE } ,
{ " CL_TYPE_BINARY_DATA " , CL_TYPE_BINARY_DATA } ,
2007-12-14 22:39:37 +00:00
{ " CL_TYPE_IGNORED " , CL_TYPE_IGNORED } ,
2008-02-20 22:04:48 +00:00
{ " CL_TYPE_ANY " , 0 } , /* for ft-sigs */
2007-12-14 22:39:37 +00:00
{ " CL_TYPE_MSEXE " , CL_TYPE_MSEXE } ,
{ " CL_TYPE_ELF " , CL_TYPE_ELF } ,
2009-07-08 15:05:22 +02:00
{ " CL_TYPE_MACHO " , CL_TYPE_MACHO } ,
2009-07-14 18:19:54 +02:00
{ " CL_TYPE_MACHO_UNIBIN " , CL_TYPE_MACHO_UNIBIN } ,
2007-12-14 22:39:37 +00:00
{ " CL_TYPE_POSIX_TAR " , CL_TYPE_POSIX_TAR } ,
{ " CL_TYPE_OLD_TAR " , CL_TYPE_OLD_TAR } ,
2009-07-06 16:15:33 +02:00
{ " CL_TYPE_CPIO_OLD " , CL_TYPE_CPIO_OLD } ,
{ " CL_TYPE_CPIO_ODC " , CL_TYPE_CPIO_ODC } ,
{ " CL_TYPE_CPIO_NEWC " , CL_TYPE_CPIO_NEWC } ,
{ " CL_TYPE_CPIO_CRC " , CL_TYPE_CPIO_CRC } ,
2007-12-14 22:39:37 +00:00
{ " CL_TYPE_GZ " , CL_TYPE_GZ } ,
{ " CL_TYPE_ZIP " , CL_TYPE_ZIP } ,
{ " CL_TYPE_BZ " , CL_TYPE_BZ } ,
{ " CL_TYPE_RAR " , CL_TYPE_RAR } ,
{ " CL_TYPE_ARJ " , CL_TYPE_ARJ } ,
{ " CL_TYPE_MSSZDD " , CL_TYPE_MSSZDD } ,
{ " CL_TYPE_MSOLE2 " , CL_TYPE_MSOLE2 } ,
{ " CL_TYPE_MSCAB " , CL_TYPE_MSCAB } ,
{ " CL_TYPE_MSCHM " , CL_TYPE_MSCHM } ,
{ " CL_TYPE_SIS " , CL_TYPE_SIS } ,
{ " CL_TYPE_SCRENC " , CL_TYPE_SCRENC } ,
{ " CL_TYPE_GRAPHICS " , CL_TYPE_GRAPHICS } ,
{ " CL_TYPE_RIFF " , CL_TYPE_RIFF } ,
{ " CL_TYPE_BINHEX " , CL_TYPE_BINHEX } ,
{ " CL_TYPE_TNEF " , CL_TYPE_TNEF } ,
{ " CL_TYPE_CRYPTFF " , CL_TYPE_CRYPTFF } ,
{ " CL_TYPE_PDF " , CL_TYPE_PDF } ,
{ " CL_TYPE_UUENCODED " , CL_TYPE_UUENCODED } ,
{ " CL_TYPE_HTML_UTF16 " , CL_TYPE_HTML_UTF16 } ,
2008-02-04 21:38:34 +00:00
{ " CL_TYPE_SCRIPT " , CL_TYPE_SCRIPT } ,
2007-12-14 22:39:37 +00:00
{ " CL_TYPE_RTF " , CL_TYPE_RTF } ,
{ " CL_TYPE_HTML " , CL_TYPE_HTML } ,
{ " CL_TYPE_MAIL " , CL_TYPE_MAIL } ,
{ " CL_TYPE_SFX " , CL_TYPE_SFX } ,
{ " CL_TYPE_ZIPSFX " , CL_TYPE_ZIPSFX } ,
{ " CL_TYPE_RARSFX " , CL_TYPE_RARSFX } ,
{ " CL_TYPE_CABSFX " , CL_TYPE_CABSFX } ,
{ " CL_TYPE_ARJSFX " , CL_TYPE_ARJSFX } ,
{ " CL_TYPE_NULSFT " , CL_TYPE_NULSFT } ,
{ " CL_TYPE_AUTOIT " , CL_TYPE_AUTOIT } ,
2009-07-13 01:02:13 +02:00
{ " CL_TYPE_ISHIELD_MSI " , CL_TYPE_ISHIELD_MSI } ,
2009-08-06 18:22:46 +02:00
{ " CL_TYPE_7Z " , CL_TYPE_7Z } ,
2008-01-07 14:20:38 +00:00
{ NULL , CL_TYPE_IGNORED }
2004-07-02 23:00:58 +00:00
} ;
2007-12-14 22:39:37 +00:00
cli_file_t cli_ftcode ( const char * name )
{
unsigned int i ;
2004-07-02 23:00:58 +00:00
2007-12-14 22:39:37 +00:00
for ( i = 0 ; ftmap [ i ] . name ; i + + )
if ( ! strcmp ( ftmap [ i ] . name , name ) )
return ftmap [ i ] . code ;
2004-07-02 23:00:58 +00:00
2007-12-14 22:39:37 +00:00
return CL_TYPE_ERROR ;
}
2004-07-02 23:00:58 +00:00
2008-10-19 16:16:49 +00:00
void cli_ftfree ( const struct cl_engine * engine )
2007-12-14 22:39:37 +00:00
{
2008-10-19 16:16:49 +00:00
struct cli_ftype * ftypes = engine - > ftypes , * pt ;
2007-12-14 22:39:37 +00:00
while ( ftypes ) {
pt = ftypes ;
ftypes = ftypes - > next ;
2009-01-26 19:47:02 +00:00
mpool_free ( engine - > mempool , pt - > magic ) ;
mpool_free ( engine - > mempool , pt - > tname ) ;
mpool_free ( engine - > mempool , pt ) ;
2007-12-14 22:39:37 +00:00
}
}
2005-03-29 00:55:06 +00:00
2007-12-14 22:39:37 +00:00
cli_file_t cli_filetype ( const unsigned char * buf , size_t buflen , const struct cl_engine * engine )
2004-07-02 23:00:58 +00:00
{
2007-12-14 22:39:37 +00:00
struct cli_ftype * ftype = engine - > ftypes ;
2004-08-24 00:50:29 +00:00
2004-07-02 23:00:58 +00:00
2007-12-14 22:39:37 +00:00
while ( ftype ) {
if ( ftype - > offset + ftype - > length < = buflen ) {
if ( ! memcmp ( buf + ftype - > offset , ftype - > magic , ftype - > length ) ) {
cli_dbgmsg ( " Recognized %s file \n " , ftype - > tname ) ;
return ftype - > type ;
2004-07-02 23:00:58 +00:00
}
}
2007-12-14 22:39:37 +00:00
ftype = ftype - > next ;
2004-07-02 23:00:58 +00:00
}
2008-01-07 14:20:38 +00:00
return cli_texttype ( buf , buflen ) ;
2004-07-02 23:00:58 +00:00
}
2006-06-17 21:00:44 +00:00
/*
 * Local prototype for is_tar(); its definition is not in this part of the
 * file.  cli_filetype2() treats a return value of 1 as an old-style tar
 * archive and 2 as a POSIX tar archive; any other value is ignored there.
 */
int is_tar ( unsigned char * buf , unsigned int nbytes ) ;
2005-03-22 21:26:27 +00:00
2006-10-25 15:40:47 +00:00
cli_file_t cli_filetype2 ( int desc , const struct cl_engine * engine )
2005-03-22 21:26:27 +00:00
{
2008-05-27 16:30:47 +00:00
unsigned char buff [ MAGIC_BUFFER_SIZE + 1 ] , * decoded ;
2006-10-25 15:40:47 +00:00
int bread , sret ;
2008-01-07 14:20:38 +00:00
cli_file_t ret = CL_TYPE_BINARY_DATA ;
2006-10-25 15:40:47 +00:00
struct cli_matcher * root ;
2006-11-15 15:26:54 +00:00
struct cli_ac_data mdata ;
2005-03-22 21:26:27 +00:00
2008-01-07 14:20:38 +00:00
if ( ! engine ) {
cli_errmsg ( " cli_filetype2: engine == NULL \n " ) ;
return CL_TYPE_ERROR ;
}
2008-05-27 16:30:47 +00:00
memset ( buff , 0 , sizeof ( buff ) ) ;
bread = cli_readn ( desc , buff , MAGIC_BUFFER_SIZE ) ;
2008-02-11 10:21:03 +00:00
if ( bread = = - 1 )
return CL_TYPE_ERROR ;
2008-05-27 16:30:47 +00:00
buff [ bread ] = 0 ;
2008-02-11 10:21:03 +00:00
2008-05-27 16:30:47 +00:00
ret = cli_filetype ( buff , bread , engine ) ;
2005-03-22 21:26:27 +00:00
2008-01-07 14:20:38 +00:00
if ( ret > = CL_TYPE_TEXT_ASCII & & ret < = CL_TYPE_BINARY_DATA ) {
/* HTML files may contain special characters and could be
* misidentified as BINARY_DATA by cli_filetype ( )
*/
2006-10-25 15:40:47 +00:00
root = engine - > root [ 0 ] ;
if ( ! root )
return ret ;
2008-12-29 17:55:30 +00:00
if ( cli_ac_initdata ( & mdata , root - > ac_partsigs , root - > ac_lsigs , CLI_DEFAULT_AC_TRACKLEN ) )
2006-10-25 15:40:47 +00:00
return ret ;
2009-08-14 14:38:13 +02:00
sret = cli_ac_scanbuff ( buff , bread , NULL , NULL , NULL , engine - > root [ 0 ] , & mdata , 0 , ret , NULL , AC_SCAN_FT , NULL ) ;
2006-11-15 15:26:54 +00:00
cli_ac_freedata ( & mdata ) ;
2006-10-25 15:40:47 +00:00
if ( sret > = CL_TYPENO ) {
ret = sret ;
} else {
2008-12-29 17:55:30 +00:00
if ( cli_ac_initdata ( & mdata , root - > ac_partsigs , root - > ac_lsigs , CLI_DEFAULT_AC_TRACKLEN ) )
2006-11-15 15:26:54 +00:00
return ret ;
2008-05-27 16:30:47 +00:00
decoded = ( unsigned char * ) cli_utf16toascii ( ( char * ) buff , bread ) ;
2006-10-25 15:40:47 +00:00
if ( decoded ) {
2009-08-14 14:38:13 +02:00
sret = cli_ac_scanbuff ( decoded , strlen ( ( char * ) decoded ) , NULL , NULL , NULL , engine - > root [ 0 ] , & mdata , 0 , CL_TYPE_TEXT_ASCII , NULL , AC_SCAN_FT , NULL ) ;
2006-10-25 15:40:47 +00:00
free ( decoded ) ;
if ( sret = = CL_TYPE_HTML )
ret = CL_TYPE_HTML_UTF16 ;
}
2006-11-15 15:26:54 +00:00
cli_ac_freedata ( & mdata ) ;
2006-12-26 16:17:02 +00:00
2007-05-01 16:08:57 +00:00
if ( ( ( ( struct cli_dconf * ) engine - > dconf ) - > phishing & PHISHING_CONF_ENTCONV ) & & ret ! = CL_TYPE_HTML_UTF16 ) {
2008-02-01 19:38:52 +00:00
const char * encoding ;
/* check if we can autodetect this encoding.
* If we can ' t don ' t try to detect HTML sig , since
* we just tried that above , and failed */
2008-05-27 16:30:47 +00:00
if ( ( encoding = encoding_detect_bom ( buff , bread ) ) ) {
unsigned char decodedbuff [ sizeof ( buff ) * 2 ] ;
2008-02-01 19:38:52 +00:00
m_area_t in_area , out_area ;
2008-05-27 16:30:47 +00:00
in_area . buffer = ( unsigned char * ) buff ;
2008-02-01 19:38:52 +00:00
in_area . length = bread ;
in_area . offset = 0 ;
out_area . buffer = decodedbuff ;
out_area . length = sizeof ( decodedbuff ) ;
out_area . offset = 0 ;
/* in htmlnorm we simply skip over \0 chars, and that allows to parse HTML in any unicode
* ( multibyte characters will not be exactly handled , but that is not a problem ) .
* However when detecting whether a file is HTML or not , we need exact conversion .
* ( just eliminating zeros and matching would introduce false positives */
if ( encoding_normalize_toascii ( & in_area , encoding , & out_area ) > = 0 & & out_area . length > 0 ) {
2008-12-29 17:55:30 +00:00
if ( cli_ac_initdata ( & mdata , root - > ac_partsigs , root - > ac_lsigs , CLI_DEFAULT_AC_TRACKLEN ) )
2008-02-01 19:38:52 +00:00
return ret ;
if ( out_area . length > 0 ) {
2009-08-14 14:38:13 +02:00
sret = cli_ac_scanbuff ( decodedbuff , out_area . length , NULL , NULL , NULL , engine - > root [ 0 ] , & mdata , 0 , 0 , NULL , AC_SCAN_FT , NULL ) ; /* FIXME: can we use CL_TYPE_TEXT_ASCII instead of 0? */
2008-02-01 19:38:52 +00:00
if ( sret = = CL_TYPE_HTML ) {
cli_dbgmsg ( " cli_filetype2: detected HTML signature in Unicode file \n " ) ;
/* htmlnorm is able to handle any unicode now, since it skips null chars */
ret = CL_TYPE_HTML ;
}
2008-01-20 22:18:14 +00:00
}
2006-12-26 16:17:02 +00:00
2008-02-01 19:38:52 +00:00
cli_ac_freedata ( & mdata ) ;
}
2008-01-20 22:18:14 +00:00
}
2006-12-26 16:17:02 +00:00
}
2006-10-25 15:40:47 +00:00
}
}
2008-01-07 14:20:38 +00:00
if ( ret = = CL_TYPE_BINARY_DATA ) {
2008-05-27 16:30:47 +00:00
switch ( is_tar ( buff , bread ) ) {
case 1 :
ret = CL_TYPE_OLD_TAR ;
cli_dbgmsg ( " Recognized old fashioned tar file \n " ) ;
break ;
case 2 :
ret = CL_TYPE_POSIX_TAR ;
cli_dbgmsg ( " Recognized POSIX tar file \n " ) ;
break ;
2005-03-22 21:26:27 +00:00
}
}
return ret ;
}