2004-07-02 23:00:58 +00:00
/*
2015-09-17 13:41:26 -04:00
* Copyright ( C ) 2015 Cisco Systems , Inc . and / or its affiliates . All rights reserved .
2013-10-17 16:54:21 -04:00
* Copyright ( C ) 2007 - 2013 Sourcefire , Inc .
2007-12-14 22:39:37 +00:00
*
2008-04-02 15:24:51 +00:00
* Authors : Tomasz Kojm
2004-07-02 23:00:58 +00:00
*
* This program is free software ; you can redistribute it and / or modify
2007-03-31 20:31:04 +00:00
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation .
2004-07-02 23:00:58 +00:00
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
*
* You should have received a copy of the GNU General Public License
* along with this program ; if not , write to the Free Software
2006-04-09 19:59:28 +00:00
* Foundation , Inc . , 51 Franklin Street , Fifth Floor , Boston ,
* MA 02110 - 1301 , USA .
2004-07-02 23:00:58 +00:00
*/
# if HAVE_CONFIG_H
# include "clamav-config.h"
# endif
# include <stdio.h>
# include <string.h>
# include <stdlib.h>
2006-12-20 15:33:55 +00:00
# include <sys/types.h>
2006-11-15 15:26:54 +00:00
# ifdef HAVE_UNISTD_H
# include <unistd.h>
# endif
2004-07-02 23:00:58 +00:00
# include "clamav.h"
# include "filetypes.h"
2004-07-19 17:54:40 +00:00
# include "others.h"
# include "readdb.h"
2005-11-11 01:31:46 +00:00
# include "matcher-ac.h"
2006-10-25 15:40:47 +00:00
# include "str.h"
2008-01-07 14:20:38 +00:00
# include "textdet.h"
2008-12-29 17:55:30 +00:00
# include "default.h"
2012-09-14 11:42:16 -04:00
# include "iowrap.h"
2014-02-06 19:01:26 -05:00
# include "mbr.h"
2014-02-10 12:00:01 -05:00
# include "gpt.h"
2014-11-24 10:47:44 -05:00
# include "ooxml.h"
2004-07-02 23:00:58 +00:00
2006-12-26 16:17:02 +00:00
# include "htmlnorm.h"
# include "entconv.h"
2008-10-21 23:55:11 +00:00
# include "mpool.h"
2013-10-17 16:54:21 -04:00
# define UNZIP_PRIVATE
# include "unzip.h"
2006-12-26 16:17:02 +00:00
2018-12-03 12:37:58 -05:00
// clang-format off
2007-12-14 22:39:37 +00:00
/*
 * Table pairing each file type's symbolic name with its cli_file_t code.
 *
 * cli_ftcode() (name -> code) and cli_ftname() (code -> name) both scan this
 * table front to back and stop at the first entry whose name is NULL, so the
 * { NULL , ... } sentinel must remain the last element. Because the scan
 * returns the first hit, entry order decides which name wins if two entries
 * ever share a code.
 *
 * NOTE(review): as listed here every name literal has a space after the
 * opening quote and before the closing quote. cli_ftcode() compares with
 * strcmp(), so a lookup only succeeds for a name padded the same way.
 * Confirm whether that padding is intentional.
 */
static const struct ftmap_s {
/* symbolic type name; NULL marks the end of the table */
const char * name ;
/* type code associated with the name */
cli_file_t code ;
} ftmap [ ] = {
2018-12-03 12:37:58 -05:00
{ " CL_TYPE_TEXT_ASCII " , CL_TYPE_TEXT_ASCII } ,
{ " CL_TYPE_TEXT_UTF8 " , CL_TYPE_TEXT_UTF8 } ,
{ " CL_TYPE_TEXT_UTF16LE " , CL_TYPE_TEXT_UTF16LE } ,
{ " CL_TYPE_TEXT_UTF16BE " , CL_TYPE_TEXT_UTF16BE } ,
{ " CL_TYPE_BINARY_DATA " , CL_TYPE_BINARY_DATA } ,
{ " CL_TYPE_IGNORED " , CL_TYPE_IGNORED } ,
{ " CL_TYPE_ANY " , CL_TYPE_ANY } ,
{ " CL_TYPE_MSEXE " , CL_TYPE_MSEXE } ,
{ " CL_TYPE_ELF " , CL_TYPE_ELF } ,
{ " CL_TYPE_MACHO " , CL_TYPE_MACHO } ,
{ " CL_TYPE_MACHO_UNIBIN " , CL_TYPE_MACHO_UNIBIN } ,
{ " CL_TYPE_POSIX_TAR " , CL_TYPE_POSIX_TAR } ,
{ " CL_TYPE_OLD_TAR " , CL_TYPE_OLD_TAR } ,
{ " CL_TYPE_CPIO_OLD " , CL_TYPE_CPIO_OLD } ,
{ " CL_TYPE_CPIO_ODC " , CL_TYPE_CPIO_ODC } ,
{ " CL_TYPE_CPIO_NEWC " , CL_TYPE_CPIO_NEWC } ,
{ " CL_TYPE_CPIO_CRC " , CL_TYPE_CPIO_CRC } ,
{ " CL_TYPE_GZ " , CL_TYPE_GZ } ,
{ " CL_TYPE_ZIP " , CL_TYPE_ZIP } ,
{ " CL_TYPE_BZ " , CL_TYPE_BZ } ,
{ " CL_TYPE_RAR " , CL_TYPE_RAR } ,
{ " CL_TYPE_ARJ " , CL_TYPE_ARJ } ,
{ " CL_TYPE_MSSZDD " , CL_TYPE_MSSZDD } ,
{ " CL_TYPE_MSOLE2 " , CL_TYPE_MSOLE2 } ,
{ " CL_TYPE_MSCAB " , CL_TYPE_MSCAB } ,
{ " CL_TYPE_MSCHM " , CL_TYPE_MSCHM } ,
{ " CL_TYPE_SIS " , CL_TYPE_SIS } ,
{ " CL_TYPE_SCRENC " , CL_TYPE_SCRENC } ,
{ " CL_TYPE_GRAPHICS " , CL_TYPE_GRAPHICS } ,
{ " CL_TYPE_RIFF " , CL_TYPE_RIFF } ,
{ " CL_TYPE_BINHEX " , CL_TYPE_BINHEX } ,
{ " CL_TYPE_TNEF " , CL_TYPE_TNEF } ,
{ " CL_TYPE_CRYPTFF " , CL_TYPE_CRYPTFF } ,
{ " CL_TYPE_PDF " , CL_TYPE_PDF } ,
{ " CL_TYPE_UUENCODED " , CL_TYPE_UUENCODED } ,
{ " CL_TYPE_HTML_UTF16 " , CL_TYPE_HTML_UTF16 } ,
{ " CL_TYPE_SCRIPT " , CL_TYPE_SCRIPT } ,
{ " CL_TYPE_RTF " , CL_TYPE_RTF } ,
{ " CL_TYPE_HTML " , CL_TYPE_HTML } ,
{ " CL_TYPE_MAIL " , CL_TYPE_MAIL } ,
{ " CL_TYPE_SFX " , CL_TYPE_SFX } ,
{ " CL_TYPE_ZIPSFX " , CL_TYPE_ZIPSFX } ,
{ " CL_TYPE_RARSFX " , CL_TYPE_RARSFX } ,
{ " CL_TYPE_CABSFX " , CL_TYPE_CABSFX } ,
{ " CL_TYPE_ARJSFX " , CL_TYPE_ARJSFX } ,
{ " CL_TYPE_NULSFT " , CL_TYPE_NULSFT } ,
{ " CL_TYPE_AUTOIT " , CL_TYPE_AUTOIT } ,
{ " CL_TYPE_ISHIELD_MSI " , CL_TYPE_ISHIELD_MSI } ,
{ " CL_TYPE_7Z " , CL_TYPE_7Z } ,
{ " CL_TYPE_7ZSFX " , CL_TYPE_7ZSFX } ,
{ " CL_TYPE_SWF " , CL_TYPE_SWF } ,
{ " CL_TYPE_ISO9660 " , CL_TYPE_ISO9660 } ,
{ " CL_TYPE_JAVA " , CL_TYPE_JAVA } ,
{ " CL_TYPE_DMG " , CL_TYPE_DMG } ,
{ " CL_TYPE_MBR " , CL_TYPE_MBR } ,
{ " CL_TYPE_GPT " , CL_TYPE_GPT } ,
{ " CL_TYPE_APM " , CL_TYPE_APM } ,
{ " CL_TYPE_XAR " , CL_TYPE_XAR } ,
{ " CL_TYPE_PART_ANY " , CL_TYPE_PART_ANY } ,
{ " CL_TYPE_PART_HFSPLUS " , CL_TYPE_PART_HFSPLUS } ,
{ " CL_TYPE_XZ " , CL_TYPE_XZ } ,
{ " CL_TYPE_OOXML_WORD " , CL_TYPE_OOXML_WORD } ,
{ " CL_TYPE_OOXML_PPT " , CL_TYPE_OOXML_PPT } ,
{ " CL_TYPE_OOXML_XL " , CL_TYPE_OOXML_XL } ,
{ " CL_TYPE_INTERNAL " , CL_TYPE_INTERNAL } ,
{ " CL_TYPE_XDP " , CL_TYPE_XDP } ,
{ " CL_TYPE_XML_WORD " , CL_TYPE_XML_WORD } ,
{ " CL_TYPE_XML_XL " , CL_TYPE_XML_XL } ,
{ " CL_TYPE_HWP3 " , CL_TYPE_HWP3 } ,
{ " CL_TYPE_XML_HWP " , CL_TYPE_XML_HWP } ,
{ " CL_TYPE_HWPOLE2 " , CL_TYPE_HWPOLE2 } ,
{ " CL_TYPE_OOXML_HWP " , CL_TYPE_OOXML_HWP } ,
{ " CL_TYPE_PS " , CL_TYPE_PS } ,
{ " CL_TYPE_MHTML " , CL_TYPE_MHTML } ,
{ " CL_TYPE_LNK " , CL_TYPE_LNK } ,
/* sentinel: the NULL name ends every scan of this table */
{ NULL , CL_TYPE_IGNORED }
2004-07-02 23:00:58 +00:00
} ;
2018-12-03 12:37:58 -05:00
// clang-format on
2004-07-02 23:00:58 +00:00
2014-07-10 18:11:49 -04:00
/* Forward declaration; the definition appears later in this file and is used
 * by cli_filetype2() when typing a partition. */
cli_file_t cli_partitiontype ( const unsigned char * buf , size_t buflen , const struct cl_engine * engine ) ;
2007-12-14 22:39:37 +00:00
cli_file_t cli_ftcode ( const char * name )
{
unsigned int i ;
2004-07-02 23:00:58 +00:00
2007-12-14 22:39:37 +00:00
for ( i = 0 ; ftmap [ i ] . name ; i + + )
if ( ! strcmp ( ftmap [ i ] . name , name ) )
return ftmap [ i ] . code ;
2004-07-02 23:00:58 +00:00
2007-12-14 22:39:37 +00:00
return CL_TYPE_ERROR ;
}
2004-07-02 23:00:58 +00:00
2011-03-04 18:27:32 +01:00
/*
 * Reverse lookup: map a file type code back to its symbolic name.
 *
 * Returns the name of the first ftmap entry whose code equals `code`, or
 * NULL when the scan reaches the NULL-named sentinel without a match.
 * The returned pointer refers to the static table and must not be freed.
 */
const char *cli_ftname(cli_file_t code)
{
    const struct ftmap_s *entry = ftmap;

    while (entry->name != NULL) {
        if (entry->code == code)
            return entry->name;
        entry++;
    }

    return NULL;
}
2008-10-19 16:16:49 +00:00
void cli_ftfree ( const struct cl_engine * engine )
2007-12-14 22:39:37 +00:00
{
2008-10-19 16:16:49 +00:00
struct cli_ftype * ftypes = engine - > ftypes , * pt ;
2007-12-14 22:39:37 +00:00
while ( ftypes ) {
pt = ftypes ;
ftypes = ftypes - > next ;
2009-01-26 19:47:02 +00:00
mpool_free ( engine - > mempool , pt - > magic ) ;
mpool_free ( engine - > mempool , pt - > tname ) ;
mpool_free ( engine - > mempool , pt ) ;
2007-12-14 22:39:37 +00:00
}
2013-09-17 16:45:48 -04:00
ftypes = engine - > ptypes ;
while ( ftypes ) {
pt = ftypes ;
ftypes = ftypes - > next ;
mpool_free ( engine - > mempool , pt - > magic ) ;
mpool_free ( engine - > mempool , pt - > tname ) ;
mpool_free ( engine - > mempool , pt ) ;
}
}
cli_file_t cli_partitiontype ( const unsigned char * buf , size_t buflen , const struct cl_engine * engine )
{
struct cli_ftype * ptype = engine - > ptypes ;
while ( ptype ) {
if ( ptype - > offset + ptype - > length < = buflen ) {
if ( ! memcmp ( buf + ptype - > offset , ptype - > magic , ptype - > length ) ) {
cli_dbgmsg ( " Recognized %s partition \n " , ptype - > tname ) ;
return ptype - > type ;
}
}
ptype = ptype - > next ;
}
2014-08-22 14:09:08 -04:00
cli_dbgmsg ( " Partition type is potentially unsupported \n " ) ;
2013-09-17 16:45:48 -04:00
return CL_TYPE_PART_ANY ;
2007-12-14 22:39:37 +00:00
}
2005-03-29 00:55:06 +00:00
2007-12-14 22:39:37 +00:00
cli_file_t cli_filetype ( const unsigned char * buf , size_t buflen , const struct cl_engine * engine )
2004-07-02 23:00:58 +00:00
{
2007-12-14 22:39:37 +00:00
struct cli_ftype * ftype = engine - > ftypes ;
2004-08-24 00:50:29 +00:00
2004-07-02 23:00:58 +00:00
2007-12-14 22:39:37 +00:00
while ( ftype ) {
if ( ftype - > offset + ftype - > length < = buflen ) {
if ( ! memcmp ( buf + ftype - > offset , ftype - > magic , ftype - > length ) ) {
cli_dbgmsg ( " Recognized %s file \n " , ftype - > tname ) ;
return ftype - > type ;
2004-07-02 23:00:58 +00:00
}
}
2007-12-14 22:39:37 +00:00
ftype = ftype - > next ;
2004-07-02 23:00:58 +00:00
}
2008-01-07 14:20:38 +00:00
return cli_texttype ( buf , buflen ) ;
2004-07-02 23:00:58 +00:00
}
2012-01-05 14:16:09 +02:00
/* Declared here, defined outside this view. cli_filetype2() treats a return
 * value of 1 as an old-style tar archive and 2 as a POSIX tar archive; other
 * values are ignored there. */
int is_tar ( const unsigned char * buf , unsigned int nbytes ) ;
2005-03-22 21:26:27 +00:00
2016-01-13 17:16:44 -05:00
/* organize by length, cannot exceed SIZEOF_LH */
2018-12-03 12:37:58 -05:00
// clang-format off
2016-01-13 14:56:46 -05:00
/*
 * Entry-name prefixes that cli_filetype2() looks for right after a ZIP local
 * file header. Each name is compared with memcmp() over `len` bytes.
 *
 * - An entry whose type is not CL_TYPE_ZIP identifies the container directly
 *   and makes cli_filetype2() return that type.
 * - An entry whose type is CL_TYPE_ZIP only marks the archive as "likely
 *   OOXML", which later triggers the cli_ooxml_filetype() check.
 *
 * The table ends with the entry whose `entry` pointer is NULL.
 *
 * NOTE(review): as listed here the name literals carry a space after the
 * opening quote and before the closing quote, so they no longer line up with
 * the `len` column (e.g. the first entry is 5 characters long but len is 3).
 * Confirm whether that padding is present in the real file.
 */
const struct ooxml_ftcodes {
/* entry-name prefix to compare against */
const char * entry ;
/* number of bytes of `entry` to compare */
size_t len ;
/* type reported (or CL_TYPE_ZIP for "hint only") */
cli_file_t type ;
} ooxml_detect [ ] = {
{ " xl/ " , 3 , CL_TYPE_OOXML_XL } ,
{ " ppt/ " , 4 , CL_TYPE_OOXML_PPT } ,
{ " word/ " , 5 , CL_TYPE_OOXML_WORD } ,
{ " BinData " , 7 , CL_TYPE_ZIP } , /* HWP */
{ " mimetype " , 8 , CL_TYPE_ZIP } , /* HWP */
{ " Contents " , 8 , CL_TYPE_ZIP } , /* HWP */
{ " docProps/ " , 9 , CL_TYPE_ZIP } , /* MS */
2016-02-16 14:15:18 -05:00
{ " customXml/ " , 10 , CL_TYPE_ZIP } , /* MS */
2016-01-13 14:56:46 -05:00
{ " version.xml " , 11 , CL_TYPE_ZIP } , /* HWP */
{ " settings.xml " , 12 , CL_TYPE_ZIP } , /* HWP */
{ " _.rels/.rels " , 12 , CL_TYPE_ZIP } , /* MS */
{ " [ContentTypes].xml " , 18 , CL_TYPE_ZIP } , /* MS */
{ " [Content_Types].xml " , 19 , CL_TYPE_ZIP } , /* MS */
{ " Preview/PrvText.txt " , 19 , CL_TYPE_ZIP } , /* HWP */
{ " Contents/content.hpf " , 20 , CL_TYPE_OOXML_HWP } ,
{ " META-INF/container.xml " , 22 , CL_TYPE_ZIP } , /* HWP */
/* sentinel: NULL entry terminates the scan */
{ NULL , 0 , CL_TYPE_ANY }
} ;
2018-12-03 12:37:58 -05:00
// clang-format on
2016-01-13 17:16:44 -05:00
/* Must equal the largest `len` value in ooxml_detect (22 at present).
 * cli_filetype2() only compares entry names when more than this many bytes
 * remain in the mapped window, and uses it to size the overlap when it maps
 * the next window. */
# define OOXML_DETECT_MAXLEN 22
2016-01-13 14:56:46 -05:00
/*
 * OOXML_FTIDENTIFIED(type)
 *
 * If `type` is one of the OOXML types (XL, PPT, WORD, HWP), log a debug
 * message and RETURN that type from the enclosing function. CL_TYPE_ZIP is
 * silently ignored; any other value is logged as unexpected and execution
 * continues after the macro.
 *
 * Because the expansion contains `return` statements, the macro may only be
 * used inside a function returning cli_file_t.
 *
 * The argument is evaluated exactly once and stored in a local, so passing an
 * expression with side effects or with low-precedence operators is safe.
 */
#define OOXML_FTIDENTIFIED(type)                                                 \
    do {                                                                         \
        const cli_file_t ooxml_ft_ = (type);                                     \
        if (ooxml_ft_ != CL_TYPE_ZIP) {                                          \
            switch (ooxml_ft_) {                                                 \
                case CL_TYPE_OOXML_XL:                                           \
                    cli_dbgmsg(" Recognized OOXML XL file \n ");                 \
                    return CL_TYPE_OOXML_XL;                                     \
                case CL_TYPE_OOXML_PPT:                                          \
                    cli_dbgmsg(" Recognized OOXML PPT file \n ");                \
                    return CL_TYPE_OOXML_PPT;                                    \
                case CL_TYPE_OOXML_WORD:                                         \
                    cli_dbgmsg(" Recognized OOXML WORD file \n ");               \
                    return CL_TYPE_OOXML_WORD;                                   \
                case CL_TYPE_OOXML_HWP:                                          \
                    cli_dbgmsg(" Recognized OOXML HWP file \n ");                \
                    return CL_TYPE_OOXML_HWP;                                    \
                default:                                                         \
                    cli_dbgmsg(" unexpected ooxml_filetype return: %i \n ",      \
                               (int)ooxml_ft_);                                  \
            }                                                                    \
        }                                                                        \
    } while (0)
2015-12-17 16:16:55 -05:00
2013-09-17 16:45:48 -04:00
/*
 * Determine the type of the data behind an fmap.
 *
 * map:      fmap to inspect. The leading bytes (at most MAGIC_BUFFER_SIZE) are
 *           used for the magic checks; the ZIP/OOXML probe may map further
 *           windows of the file.
 * engine:   engine supplying the magic lists and engine->root[0] for the
 *           AC_SCAN_FT matcher pass. NULL yields CL_TYPE_ERROR.
 * basetype: CL_TYPE_PART_ANY selects partition typing (cli_partitiontype());
 *           any other value selects file typing (cli_filetype()).
 *
 * Returns the detected cli_file_t, or CL_TYPE_ERROR when engine is NULL, when
 * the header cannot be mapped or copied, or when a later window needed for the
 * OOXML check cannot be mapped.
 */
cli_file_t cli_filetype2 ( fmap_t * map , const struct cl_engine * engine , cli_file_t basetype )
2005-03-22 21:26:27 +00:00
{
2012-09-14 11:42:16 -04:00
unsigned char buffer [ MAGIC_BUFFER_SIZE ] ;
2012-01-05 14:16:09 +02:00
const unsigned char * buff ;
unsigned char * decoded ;
2013-09-17 16:45:48 -04:00
int bread , sret ;
2008-01-07 14:20:38 +00:00
cli_file_t ret = CL_TYPE_BINARY_DATA ;
2006-10-25 15:40:47 +00:00
struct cli_matcher * root ;
2006-11-15 15:26:54 +00:00
struct cli_ac_data mdata ;
2005-03-22 21:26:27 +00:00
2008-01-07 14:20:38 +00:00
if ( ! engine ) {
cli_errmsg ( " cli_filetype2: engine == NULL \n " ) ;
return CL_TYPE_ERROR ;
}
2013-09-17 16:45:48 -04:00
/* Header length: the whole map if it is shorter than the buffer size used
 * for this kind of typing (partition vs. file). */
if ( basetype = = CL_TYPE_PART_ANY ) {
bread = MIN ( map - > len , CL_PART_MBUFF_SIZE ) ;
}
else {
bread = MIN ( map - > len , CL_FILE_MBUFF_SIZE ) ;
}
if ( bread > MAGIC_BUFFER_SIZE ) {
/* Save anyone who tampered with the header */
bread = MAGIC_BUFFER_SIZE ;
}
2009-09-01 13:49:36 +02:00
buff = fmap_need_off_once ( map , 0 , bread ) ;
2012-09-14 11:42:16 -04:00
if ( buff ) {
/* NOTE(review): the header is copied into `buffer`, but every later use in
 * this function reads `buff`; `buffer` is not referenced again below. */
sret = cli_memcpy ( buffer , buff , bread ) ;
if ( sret ) {
cli_errmsg ( " cli_filetype2: fileread error! \n " ) ;
return CL_TYPE_ERROR ;
}
sret = 0 ;
} else {
return CL_TYPE_ERROR ;
}
2013-09-17 16:45:48 -04:00
if ( basetype = = CL_TYPE_PART_ANY ) { /* typing a partition */
ret = cli_partitiontype ( buff , bread , engine ) ;
}
else { /* typing a file */
ret = cli_filetype ( buff , bread , engine ) ;
if ( ret = = CL_TYPE_BINARY_DATA ) {
/* No magic or text match: check for tar (1 = old style, 2 = POSIX).
 * Any other is_tar() result leaves ret as CL_TYPE_BINARY_DATA. */
switch ( is_tar ( buff , bread ) ) {
case 1 :
cli_dbgmsg ( " Recognized old fashioned tar file \n " ) ;
return CL_TYPE_OLD_TAR ;
case 2 :
cli_dbgmsg ( " Recognized POSIX tar file \n " ) ;
return CL_TYPE_POSIX_TAR ;
}
2014-01-24 18:29:56 -05:00
/* ZIP with a large enough header: look at the entry names that follow the
 * local file headers to tell OOXML/HWP containers from plain ZIP. */
} else if ( ret = = CL_TYPE_ZIP & & bread > 2 * ( SIZEOF_LH + 5 ) ) {
2013-10-17 16:54:21 -04:00
/* bytes 'P' 'K' 0x03 0x04, searched for as the start of a local header */
const char lhdr_magic [ 4 ] = { 0x50 , 0x4b , 0x03 , 0x04 } ;
2014-01-24 18:29:56 -05:00
/* zbuff/zread: current window and its size; zoff: file offset just past
 * the current window; znamep/zlen: search cursor and bytes left from it */
const unsigned char * zbuff = buff ;
uint32_t zread = bread ;
uint64_t zoff = bread ;
const unsigned char * znamep = buff ;
int32_t zlen = bread ;
int lhc = 0 ;
2016-01-13 14:56:46 -05:00
int zi , i , likely_ooxml = 0 ;
2014-10-14 17:15:18 -04:00
cli_file_t ret2 ;
2014-01-24 18:29:56 -05:00
/* at most 32 iterations (each one is a header search or a window remap) */
for ( zi = 0 ; zi < 32 ; zi + + ) {
2014-07-10 18:11:49 -04:00
znamep = ( const unsigned char * ) cli_memstr ( ( const char * ) znamep , zlen , lhdr_magic , 4 ) ;
2014-01-24 18:29:56 -05:00
if ( NULL ! = znamep ) {
/* step over SIZEOF_LH bytes; the entry name is expected to start there
 * (assumes SIZEOF_LH is the fixed local-header size - confirm in unzip.h) */
znamep + = SIZEOF_LH ;
/* bytes left in the window from znamep; can be <= 0 near the window end */
zlen = zread - ( znamep - zbuff ) ;
2016-01-13 17:16:44 -05:00
if ( zlen > OOXML_DETECT_MAXLEN ) {
2016-01-13 14:56:46 -05:00
for ( i = 0 ; ooxml_detect [ i ] . entry ; i + + ) {
2016-01-13 17:16:44 -05:00
if ( 0 = = memcmp ( znamep , ooxml_detect [ i ] . entry , ooxml_detect [ i ] . len ) ) {
if ( ooxml_detect [ i ] . type ! = CL_TYPE_ZIP ) {
OOXML_FTIDENTIFIED ( ooxml_detect [ i ] . type ) ;
/* returns any unexpected type detection */
return ooxml_detect [ i ] . type ;
2016-01-13 14:56:46 -05:00
}
2016-01-13 17:16:44 -05:00
/* CL_TYPE_ZIP entries are only a hint that this may be OOXML */
likely_ooxml = 1 ;
2015-12-17 16:16:55 -05:00
}
}
2016-01-13 17:16:44 -05:00
/* only check first three readable zip headers */
2014-10-14 17:15:18 -04:00
if ( + + lhc > 2 ) {
2016-01-13 17:16:44 -05:00
/* if likely, check full archive */
2014-10-14 17:15:18 -04:00
if ( likely_ooxml ) {
cli_dbgmsg ( " Likely OOXML, checking additional zip headers \n " ) ;
if ( ( ret2 = cli_ooxml_filetype ( NULL , map ) ) ! = CL_SUCCESS ) {
/* either an error or retyping has occurred, return error or just CL_TYPE_ZIP? */
2016-01-13 14:56:46 -05:00
OOXML_FTIDENTIFIED ( ret2 ) ;
2016-01-13 17:16:44 -05:00
/* falls-through to additional filetyping */
2014-10-14 17:15:18 -04:00
}
}
break ;
2014-01-24 18:29:56 -05:00
}
2016-01-13 17:16:44 -05:00
}
else {
znamep = NULL ; /* force to map more */
}
2014-01-24 18:29:56 -05:00
}
/* nothing (more) usable in this window: map the next one or stop */
if ( znamep = = NULL ) {
if ( map - > len - zoff > SIZEOF_LH ) {
2016-01-13 17:16:44 -05:00
zoff - = SIZEOF_LH + OOXML_DETECT_MAXLEN + 1 ; /* remap for SIZEOF_LH+filelen for header overlap map boundary */
2014-01-24 18:29:56 -05:00
zread = MIN ( MAGIC_BUFFER_SIZE , map - > len - zoff ) ;
zbuff = fmap_need_off_once ( map , zoff , zread ) ;
if ( zbuff = = NULL ) {
cli_dbgmsg ( " cli_filetype2: error mapping data for OOXML check \n " ) ;
return CL_TYPE_ERROR ;
}
zoff + = zread ;
znamep = zbuff ;
zlen = zread ;
}
else {
break ; /* end of data */
2013-10-17 16:54:21 -04:00
}
}
2014-01-24 18:29:56 -05:00
}
2014-02-06 19:01:26 -05:00
} else if ( ret = = CL_TYPE_MBR ) {
2014-03-13 15:25:33 -04:00
/* given filetype sig type 0 */
/* cli_mbr_check() result: CL_TYPE_GPT -> GPT, CL_CLEAN -> MBR confirmed,
 * anything else -> treat the data as binary and continue typing below */
int iret = cli_mbr_check ( buff , bread , map - > len ) ;
if ( iret = = CL_TYPE_GPT ) {
cli_dbgmsg ( " Recognized GUID Partition Table file \n " ) ;
return CL_TYPE_GPT ;
}
else if ( iret = = CL_CLEAN ) {
return CL_TYPE_MBR ;
2014-02-06 19:01:26 -05:00
}
/* re-detect type */
cli_dbgmsg ( " Recognized binary data \n " ) ;
ret = CL_TYPE_BINARY_DATA ;
2013-10-17 16:54:21 -04:00
}
2010-12-30 15:04:02 +01:00
}
2008-01-07 14:20:38 +00:00
if ( ret > = CL_TYPE_TEXT_ASCII & & ret < = CL_TYPE_BINARY_DATA ) {
/* HTML files may contain special characters and could be
* misidentified as BINARY_DATA by cli_filetype ( )
*/
2006-10-25 15:40:47 +00:00
root = engine - > root [ 0 ] ;
if ( ! root )
return ret ;
2009-08-21 15:55:10 +02:00
if ( cli_ac_initdata ( & mdata , root - > ac_partsigs , root - > ac_lsigs , root - > ac_reloff_num , CLI_DEFAULT_AC_TRACKLEN ) )
2006-10-25 15:40:47 +00:00
return ret ;
2009-08-14 14:38:13 +02:00
/* pass 1: run the matcher in AC_SCAN_FT mode over the raw header */
sret = cli_ac_scanbuff ( buff , bread , NULL , NULL , NULL , engine - > root [ 0 ] , & mdata , 0 , ret , NULL , AC_SCAN_FT , NULL ) ;
2006-11-15 15:26:54 +00:00
cli_ac_freedata ( & mdata ) ;
2006-10-25 15:40:47 +00:00
/* a result of at least CL_TYPENO is taken as the detected type */
if ( sret > = CL_TYPENO ) {
ret = sret ;
} else {
2009-08-21 15:55:10 +02:00
if ( cli_ac_initdata ( & mdata , root - > ac_partsigs , root - > ac_lsigs , root - > ac_reloff_num , CLI_DEFAULT_AC_TRACKLEN ) )
2006-11-15 15:26:54 +00:00
return ret ;
2008-05-27 16:30:47 +00:00
/* pass 2: convert the header with cli_utf16toascii() and scan the result
 * (bread / 2 bytes); an HTML hit is reported as CL_TYPE_HTML_UTF16 */
decoded = ( unsigned char * ) cli_utf16toascii ( ( char * ) buff , bread ) ;
2006-10-25 15:40:47 +00:00
if ( decoded ) {
2010-03-05 22:01:48 +01:00
sret = cli_ac_scanbuff ( decoded , bread / 2 , NULL , NULL , NULL , engine - > root [ 0 ] , & mdata , 0 , CL_TYPE_TEXT_ASCII , NULL , AC_SCAN_FT , NULL ) ;
2006-10-25 15:40:47 +00:00
free ( decoded ) ;
if ( sret = = CL_TYPE_HTML )
ret = CL_TYPE_HTML_UTF16 ;
}
2006-11-15 15:26:54 +00:00
cli_ac_freedata ( & mdata ) ;
2006-12-26 16:17:02 +00:00
2007-05-01 16:08:57 +00:00
/* pass 3: only when PHISHING_CONF_ENTCONV is set in dconf and pass 2 did
 * not detect UTF-16 HTML */
if ( ( ( ( struct cli_dconf * ) engine - > dconf ) - > phishing & PHISHING_CONF_ENTCONV ) & & ret ! = CL_TYPE_HTML_UTF16 ) {
2008-02-01 19:38:52 +00:00
const char * encoding ;
/* check if we can autodetect this encoding.
* If we can ' t don ' t try to detect HTML sig , since
* we just tried that above , and failed */
2008-05-27 16:30:47 +00:00
if ( ( encoding = encoding_detect_bom ( buff , bread ) ) ) {
2009-10-08 22:50:16 +02:00
unsigned char decodedbuff [ ( MAGIC_BUFFER_SIZE + 1 ) * 2 ] ;
2008-02-01 19:38:52 +00:00
m_area_t in_area , out_area ;
2012-06-13 08:34:12 -04:00
memset ( decodedbuff , 0 , sizeof ( decodedbuff ) ) ;
2008-02-01 19:38:52 +00:00
2008-05-27 16:30:47 +00:00
in_area . buffer = ( unsigned char * ) buff ;
2008-02-01 19:38:52 +00:00
in_area . length = bread ;
in_area . offset = 0 ;
out_area . buffer = decodedbuff ;
out_area . length = sizeof ( decodedbuff ) ;
out_area . offset = 0 ;
2014-05-19 16:48:46 -04:00
/* in htmlnorm we simply skip over \0 chars, allowing HTML parsing in any unicode
2008-02-01 19:38:52 +00:00
* ( multibyte characters will not be exactly handled , but that is not a problem ) .
* However when detecting whether a file is HTML or not , we need exact conversion .
* ( just eliminating zeros and matching would introduce false positives */
if ( encoding_normalize_toascii ( & in_area , encoding , & out_area ) > = 0 & & out_area . length > 0 ) {
2009-08-21 15:55:10 +02:00
if ( cli_ac_initdata ( & mdata , root - > ac_partsigs , root - > ac_lsigs , root - > ac_reloff_num , CLI_DEFAULT_AC_TRACKLEN ) )
2008-02-01 19:38:52 +00:00
return ret ;
/* NOTE(review): out_area.length > 0 was already tested in the enclosing
 * condition, so this check looks redundant. */
if ( out_area . length > 0 ) {
2009-08-14 14:38:13 +02:00
sret = cli_ac_scanbuff ( decodedbuff , out_area . length , NULL , NULL , NULL , engine - > root [ 0 ] , & mdata , 0 , 0 , NULL , AC_SCAN_FT , NULL ) ; /* FIXME: can we use CL_TYPE_TEXT_ASCII instead of 0? */
2008-02-01 19:38:52 +00:00
if ( sret = = CL_TYPE_HTML ) {
cli_dbgmsg ( " cli_filetype2: detected HTML signature in Unicode file \n " ) ;
/* htmlnorm is able to handle any unicode now, since it skips null chars */
ret = CL_TYPE_HTML ;
}
2008-01-20 22:18:14 +00:00
}
2006-12-26 16:17:02 +00:00
2008-02-01 19:38:52 +00:00
cli_ac_freedata ( & mdata ) ;
}
2008-01-20 22:18:14 +00:00
}
2006-12-26 16:17:02 +00:00
}
2006-10-25 15:40:47 +00:00
}
}
2005-03-22 21:26:27 +00:00
return ret ;
}