2005-04-27 12:45:17 +00:00
/*
2025-02-14 10:24:30 -05:00
* Copyright ( C ) 2013 - 2025 Cisco Systems , Inc . and / or its affiliates . All rights reserved .
2019-01-25 10:15:50 -05:00
* Copyright ( C ) 2007 - 2013 Sourcefire , Inc .
2008-04-02 15:24:51 +00:00
*
2010-05-11 11:34:19 +03:00
* Authors : Nigel Horne , Török Edvin
*
* Also based on Matt Olney ' s pdf parser in snort - nrt .
2005-04-27 12:45:17 +00:00
*
* This program is free software ; you can redistribute it and / or modify
2008-04-02 15:24:51 +00:00
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation .
2005-04-27 12:45:17 +00:00
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
*
* You should have received a copy of the GNU General Public License
* along with this program ; if not , write to the Free Software
2008-04-02 15:24:51 +00:00
* Foundation , Inc . , 51 Franklin Street , Fifth Floor , Boston ,
* MA 02110 - 1301 , USA .
2007-03-01 11:06:37 +00:00
*
* TODO : Embedded fonts
* TODO : Predictor image handling
2005-04-27 12:45:17 +00:00
*/
# if HAVE_CONFIG_H
# include "clamav-config.h"
# endif
2005-04-30 13:12:28 +00:00
# include <stdio.h>
# include <sys/types.h>
# include <sys/stat.h>
# include <ctype.h>
# include <string.h>
# include <fcntl.h>
# include <stdlib.h>
2005-05-01 11:47:49 +00:00
# include <errno.h>
2018-12-03 12:40:13 -05:00
# ifdef HAVE_LIMITS_H
2007-02-25 00:43:49 +00:00
# include <limits.h>
# endif
2018-12-03 12:40:13 -05:00
# ifdef HAVE_UNISTD_H
2007-06-18 18:57:57 +00:00
# include <unistd.h>
# endif
2005-04-30 13:12:28 +00:00
# include <zlib.h>
2014-05-27 16:33:51 -04:00
# if HAVE_ICONV
2014-05-23 14:06:35 -04:00
# include <iconv.h>
2014-05-27 16:33:51 -04:00
# endif
2014-05-23 14:06:35 -04:00
2017-03-31 12:11:31 -04:00
# ifdef _WIN32
# include <stdint.h>
# endif
2007-02-25 00:43:49 +00:00
# include "clamav.h"
# include "others.h"
2005-12-09 07:39:17 +00:00
# include "pdf.h"
2016-03-31 16:32:26 -04:00
# include "pdfdecode.h"
2008-02-12 11:33:47 +00:00
# include "scanners.h"
2009-08-25 01:21:15 +02:00
# include "fmap.h"
2009-08-24 22:09:38 +02:00
# include "str.h"
2020-04-29 14:19:41 -07:00
# include "entconv.h"
2010-08-02 15:42:58 +03:00
# include "bytecode.h"
# include "bytecode_api.h"
2011-05-07 18:06:06 +03:00
# include "arc4.h"
2011-12-14 15:43:14 +02:00
# include "rijndael.h"
2011-12-23 17:40:22 +02:00
# include "textnorm.h"
2015-04-14 15:53:17 -04:00
# include "conv.h"
2014-04-29 17:27:02 -04:00
# include "json_api.h"
2011-12-28 19:05:57 +02:00
2018-12-03 12:40:13 -05:00
# ifdef CL_DEBUG
2019-03-05 21:15:41 -05:00
/*#define SAVE_TMP
2008-08-21 20:21:43 +00:00
* Save the file being worked on in tmp */
2007-03-01 11:06:37 +00:00
# endif
2019-07-15 15:10:24 -07:00
/*
 * Upper bound on the number of entries kept in pdf->objs.
 * The object finders in this file compare pdf->nobjs against this value and,
 * once it is reached, set the BAD_PDF_TOOMANYOBJS flag and stop with CL_BREAK.
 */
# define MAX_PDF_OBJECTS (64 * 1024)
2014-06-03 09:46:13 -04:00
/* Forward declaration: the prototypes below only need pointers to it. */
struct pdf_struct ;
2018-12-03 12:40:13 -05:00
/*
 * Buffer-scanning helpers with internal linkage. Both take a position and the
 * number of bytes available from it.
 * NOTE(review): their definitions are outside this part of the file; the
 * names suggest "start of next line" / "start of next object" - confirm there.
 */
static const char * pdf_nextlinestart ( const char * ptr , size_t len ) ;
static const char * pdf_nextobject ( const char * ptr , size_t len ) ;
2005-05-21 22:07:14 +00:00
2014-04-29 17:27:02 -04:00
/* PDF statistics callbacks and related */
/*
 * Every *_cb prototype below has the same shape:
 *   (struct pdf_struct *, struct pdf_obj *, struct pdfname_action *)
 * NOTE(review): presumably each one is referenced from a pdfname_action table
 * keyed by the PDF name it is called for - confirm against that table.
 */
2014-06-25 14:06:17 -04:00
struct pdfname_action ;
2014-04-16 14:23:16 -04:00
2014-04-29 17:27:02 -04:00
static void pdf_export_json ( struct pdf_struct * ) ;
2014-06-25 14:06:17 -04:00
static void ASCIIHexDecode_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void ASCII85Decode_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void EmbeddedFile_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void FlateDecode_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void Image_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void LZWDecode_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void RunLengthDecode_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void CCITTFaxDecode_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void JBIG2Decode_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void DCTDecode_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void JPXDecode_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void Crypt_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void Standard_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void Sig_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void JavaScript_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void OpenAction_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void Launch_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void Page_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void Author_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void Creator_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void Producer_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void CreationDate_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void ModificationDate_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void Title_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void Subject_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void Keywords_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void Pages_cb ( struct pdf_struct * , struct pdf_obj * , struct pdfname_action * ) ;
static void Colors_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act ) ;
2014-06-27 12:43:23 -04:00
static void RichMedia_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act ) ;
static void AcroForm_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act ) ;
static void XFA_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act ) ;
2025-04-07 16:50:09 -07:00
static void URI_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act ) ;
2024-03-25 13:01:46 -04:00
2014-04-29 17:27:02 -04:00
/* End PDF statistics callbacks and related */
2014-04-16 14:23:16 -04:00
2018-08-14 14:00:31 -07:00
/*
 * Value readers with internal linkage. Each takes a buffer (q0 / q), a length
 * and the key to look for; pdf_getdict takes the length by pointer.
 * NOTE(review): ownership of the char * results and the exact lookup rules
 * are defined with the functions, which are not visible here - confirm there.
 */
static int pdf_readint ( const char * q0 , int len , const char * key ) ;
2018-12-03 12:40:13 -05:00
static const char * pdf_getdict ( const char * q0 , int * len , const char * key ) ;
2018-08-14 14:00:31 -07:00
static char * pdf_readval ( const char * q , int len , const char * key ) ;
2023-11-03 15:00:46 -04:00
static char * pdf_readstring ( const char * q0 , int len , const char * key , unsigned * slen , const char * * qend , bool noescape ) ;
2018-08-14 14:00:31 -07:00
2010-05-10 11:57:44 +03:00
/**
 * @brief Check whether a cross-reference marker can be found from a position.
 *
 * Leading spaces, line feeds and carriage returns are skipped first. The check
 * succeeds when the remaining data begins with the keyword "xref", or when the
 * name "/XRef" (cross-reference stream) occurs at or after that position. The
 * "/XRef" search stops a few bytes short of eof.
 *
 * @param xref  Position to start checking from.
 * @param eof   End of the data that may be examined.
 *
 * @return 0 if "xref" or "/XRef" was found.
 * @return -1 otherwise, including when fewer than five bytes remain.
 */
static int xrefCheck(const char *xref, const char *eof)
{
    const char *cursor = xref;
    const char *scan;

    /* Step over blanks and line breaks in front of the keyword. */
    for (; cursor < eof; cursor++) {
        if (*cursor != ' ' && *cursor != '\n' && *cursor != '\r') {
            break;
        }
    }

    if (cursor + 4 >= eof) {
        return -1;
    }

    if (0 == memcmp(cursor, "xref", strlen("xref"))) {
        cli_dbgmsg("cli_pdf: found xref\n");
        return 0;
    }

    /* could be xref stream */
    scan = cursor;
    while (scan + 5 < eof) {
        if (0 == memcmp(scan, "/XRef", strlen("/XRef"))) {
            cli_dbgmsg("cli_pdf: found /XRef\n");
            return 0;
        }
        scan++;
    }

    return -1;
}
2011-12-28 19:05:57 +02:00
/* define this to be noisy about things that we can't parse properly */
/*
 * NOISY is explicitly undefined on the next line, so unless that line is
 * changed the #else branch is taken and both noisy_msg() and noisy_warnmsg()
 * expand to nothing.
 * With NOISY defined, noisy_msg() forwards to cli_infomsg() using pdf->ctx and
 * noisy_warnmsg() forwards to cli_warnmsg().
 */
2017-08-24 16:33:33 -04:00
# undef NOISY
2011-12-28 19:05:57 +02:00
# ifdef NOISY
# define noisy_msg(pdf, ...) cli_infomsg(pdf->ctx, __VA_ARGS__)
2017-08-16 17:31:45 -04:00
# define noisy_warnmsg(...) cli_warnmsg(__VA_ARGS__)
2011-12-28 19:05:57 +02:00
# else
2017-08-16 17:31:45 -04:00
# define noisy_msg(pdf, ...)
# define noisy_warnmsg(...)
2011-12-28 19:05:57 +02:00
# endif
2018-08-14 14:00:31 -07:00
/**
 * @brief Searching BACKwards, find the nearest character that is not whitespace.
 *
 * The bytes treated as whitespace are NUL (0x00), tab (0x09), line feed (0x0a),
 * form feed (0x0c), carriage return (0x0d) and space (0x20).
 *
 * @param q      Position to start from (the high end of the search space).
 * @param start  Beginning of the search space; the search never goes below it
 *               and the byte at start itself is not examined.
 *
 * @return const char*  Address of the first non-whitespace byte met while
 *                      walking backwards, OR start if none was met before it.
 */
static const char *findNextNonWSBack(const char *q, const char *start)
{
    const char *pos;

    for (pos = q; pos > start; pos--) {
        switch (*pos) {
            case 0x00:
            case 0x09:
            case 0x0a:
            case 0x0c:
            case 0x0d:
            case 0x20:
                /* whitespace: keep walking towards start */
                continue;
            default:
                return pos;
        }
    }

    return pos;
}
2018-08-14 14:00:31 -07:00
/**
 * @brief Searching FORwards, find the next character that is not whitespace.
 *
 * The bytes treated as whitespace are NUL (0x00), tab (0x09), line feed (0x0a),
 * form feed (0x0c), carriage return (0x0d) and space (0x20).
 *
 * @param q    Position to start from (the low end of the search space).
 * @param end  End of the search space; the byte at end is never examined.
 *             If q is already at or beyond end, q is returned unchanged.
 *
 * @return const char*  Address of the first non-whitespace byte at or after q,
 *                      OR end if only whitespace was found before it.
 */
static const char *findNextNonWS(const char *q, const char *end)
{
    const char *pos;

    for (pos = q; pos < end; pos++) {
        switch (*pos) {
            case 0x00:
            case 0x09:
            case 0x0a:
            case 0x0c:
            case 0x0d:
            case 0x20:
                /* whitespace: keep walking towards end */
                continue;
            default:
                return pos;
        }
    }

    return pos;
}
/**
* @ brief Find bounds of stream .
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* PDF streams are prefixed with " stream " and suffixed with " endstream " .
* Return value indicates success or failure .
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* @ param start start address of search space .
2019-03-05 21:15:41 -05:00
* @ param size size of search space
2018-08-14 14:00:31 -07:00
* @ param [ out ] stream output param , address of start of stream data
2019-03-05 21:15:41 -05:00
* @ param [ out ] stream_size output param , size of stream data
2018-08-14 14:00:31 -07:00
* @ param newline_hack hack to support newlines that are \ r \ n , and not just \ n or just \ r .
2019-03-05 21:15:41 -05:00
*
* @ return cl_error_t CL_SUCCESS if stream bounds were found .
* @ return cl_error_t CL_BREAK if stream bounds could not be found .
* @ return cl_error_t CL_EFORMAT if stream start was found , but not end . ( truncated )
* @ return cl_error_t CL_EARG if invalid args were provided .
2018-08-14 14:00:31 -07:00
*/
2019-03-05 21:15:41 -05:00
static cl_error_t find_stream_bounds (
2018-12-03 12:40:13 -05:00
const char * start ,
2019-03-05 21:15:41 -05:00
size_t size ,
const char * * stream ,
size_t * stream_size ,
2018-08-14 14:00:31 -07:00
int newline_hack )
2010-05-11 10:37:10 +03:00
{
2019-03-05 21:15:41 -05:00
cl_error_t status = CL_BREAK ;
const char * idx ;
const char * stream_begin ;
const char * endstream_begin ;
size_t bytesleft = size ;
if ( ( NULL = = start ) | | ( 0 = = bytesleft ) | | ( NULL = = stream ) | | ( NULL = = stream_size ) ) {
status = CL_EARG ;
return status ;
}
* stream = NULL ;
* stream_size = 0 ;
2018-08-14 14:00:31 -07:00
/* Begin by finding the "stream" string that prefixes stream data. */
2019-03-05 21:15:41 -05:00
if ( ( stream_begin = cli_memstr ( start , bytesleft , " stream " , strlen ( " stream " ) ) ) ) {
idx = stream_begin + strlen ( " stream " ) ;
2021-04-06 10:05:10 -07:00
if ( ( size_t ) ( idx - start ) > = bytesleft )
2019-03-05 21:15:41 -05:00
goto done ;
2021-04-06 10:05:10 -07:00
bytesleft - = idx - start ;
2014-04-07 16:39:54 -04:00
2023-11-26 15:01:19 -08:00
/* Skip any new line characters. */
2019-03-05 21:15:41 -05:00
if ( bytesleft > = 2 & & idx [ 0 ] = = ' \xd ' & & idx [ 1 ] = = ' \xa ' ) {
idx + = 2 ;
2021-04-06 10:05:10 -07:00
bytesleft - = 2 ;
if ( newline_hack & & ( bytesleft > 2 ) & & idx [ 0 ] = = ' \xa ' ) {
2019-03-05 21:15:41 -05:00
idx + + ;
2021-04-06 10:05:10 -07:00
bytesleft - - ;
}
2019-03-05 21:15:41 -05:00
} else if ( bytesleft & & idx [ 0 ] = = ' \xa ' ) {
idx + + ;
2021-04-06 10:05:10 -07:00
bytesleft - - ;
2014-04-07 16:39:54 -04:00
}
2019-03-05 21:15:41 -05:00
/* Pass back start of the stream data. */
* stream = idx ;
2018-08-14 14:00:31 -07:00
2019-03-05 21:15:41 -05:00
/* Now find the "endstream" string that suffixes stream data. */
endstream_begin = cli_memstr ( idx , bytesleft , " endstream " , strlen ( " endstream " ) ) ;
if ( ! endstream_begin ) {
/* Couldn't find "endstream", but that's ok --
* - - we ' ll just count the rest of the provided buffer . */
cli_dbgmsg ( " find_stream_bounds: Truncated stream found! \n " ) ;
endstream_begin = start + size ;
status = CL_EFORMAT ;
2018-08-14 14:00:31 -07:00
}
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
/* Pass back end of the stream data, as offset from start. */
* stream_size = endstream_begin - * stream ;
2018-08-14 14:00:31 -07:00
2019-03-05 21:15:41 -05:00
if ( CL_EFORMAT ! = status )
status = CL_SUCCESS ;
2010-05-11 10:37:10 +03:00
}
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
done :
return status ;
2010-05-11 10:37:10 +03:00
}
2018-06-09 09:42:57 -04:00
/**
2019-03-05 21:15:41 -05:00
* @ brief Find the next * indirect * object in an object stream , adds it to our list of
2018-08-14 14:00:31 -07:00
* objects , and increments nobj .
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* Indirect objects in a stream DON ' T begin with " obj " and end with " endobj " .
2020-07-01 16:56:26 -07:00
* Instead , they have an objid and an offset from the first object to point you
2018-08-14 14:00:31 -07:00
* right at them .
2019-03-05 21:15:41 -05:00
*
2020-07-01 16:56:26 -07:00
* If found , objstm - > current will be updated to the next objid .
2019-03-05 21:15:41 -05:00
*
* All objects in an object stream are indirect and thus do not begin or start
* with " obj " or " endobj " . Instead , the object stream takes the following
2018-08-14 14:00:31 -07:00
* format .
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* < dictionary describing stream > objstm content endobjstm
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* where content looks something like the following :
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* 15 0 16 3 17 46 ( ab ) < < / IDS 8 0 R / JavaScript 27 0 R / URLS 9 0 R > > < < / Names [ ( Test ) 28 0 R ] > >
2019-03-05 21:15:41 -05:00
*
* In the above example , the literal string ( ab ) is indirect object # 15 , and
* begins at offset 0 of the set of objects . The next object , # 16 begis at
* offset 3 is a dictionary . The final object is also a dictionary , beginning
2018-08-14 14:00:31 -07:00
* at offset 46.
2019-03-05 21:15:41 -05:00
*
* @ param pdf Pdf struct that keeps track of all information found in the PDF .
2018-08-14 14:00:31 -07:00
* @ param objstm
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* @ return CL_SUCCESS if success
* @ return CL_EPARSE if parsing error
* @ return CL_EMEM if error allocating memory
* @ return CL_EARG if invalid arguments
*/
int pdf_findobj_in_objstm ( struct pdf_struct * pdf , struct objstm_struct * objstm , struct pdf_obj * * obj_found )
{
2018-12-03 12:40:13 -05:00
cl_error_t status = CL_EPARSE ;
2018-08-14 14:00:31 -07:00
struct pdf_obj * obj = NULL ;
2019-03-05 21:15:41 -05:00
unsigned long objid = 0 , objoff = 0 ;
2019-01-22 14:15:46 -05:00
long temp_long = 0 ;
2018-12-03 12:40:13 -05:00
const char * index = NULL ;
2018-08-14 14:00:31 -07:00
size_t bytes_remaining = 0 ;
if ( NULL = = pdf | | NULL = = objstm ) {
cli_warnmsg ( " pdf_findobj_in_objstm: invalid arguments \n " ) ;
return CL_EARG ;
}
2019-07-15 15:10:24 -07:00
if ( pdf - > nobjs > = MAX_PDF_OBJECTS ) {
pdf - > flags | = 1 < < BAD_PDF_TOOMANYOBJS ;
cli_dbgmsg ( " pdf_findobj_in_objstm: reached object maximum \n " ) ;
status = CL_BREAK ;
goto done ;
}
2018-08-14 14:00:31 -07:00
* obj_found = NULL ;
2018-12-03 12:40:13 -05:00
index = objstm - > streambuf + objstm - > current_pair ;
2018-08-14 14:00:31 -07:00
bytes_remaining = objstm - > streambuf_len - objstm - > current_pair ;
obj = calloc ( sizeof ( struct pdf_obj ) , 1 ) ;
if ( ! obj ) {
cli_warnmsg ( " pdf_findobj_in_objstm: out of memory finding objects in stream \n " ) ;
status = CL_EMEM ;
goto done ;
}
/* This object is in a stream, not in the regular map buffer. */
obj - > objstm = objstm ;
2020-07-01 16:56:26 -07:00
/* objstm->current_pair points directly to the objid */
2019-01-22 14:15:46 -05:00
if ( CL_SUCCESS ! = cli_strntol_wrap ( index , bytes_remaining , 0 , 10 , & temp_long ) ) {
2018-08-14 14:00:31 -07:00
/* Failed to find objid */
cli_dbgmsg ( " pdf_findobj_in_objstm: Failed to find objid for obj in object stream \n " ) ;
status = CL_EPARSE ;
goto done ;
2019-01-22 14:15:46 -05:00
} else if ( temp_long < 0 ) {
cli_dbgmsg ( " pdf_findobj_in_objstm: Encountered invalid negative objid (%ld). \n " , temp_long ) ;
status = CL_EPARSE ;
goto done ;
2018-08-14 14:00:31 -07:00
}
2019-01-22 14:15:46 -05:00
objid = ( unsigned long ) temp_long ;
2018-08-14 14:00:31 -07:00
2020-07-01 16:56:26 -07:00
/* Find the obj offset that appears just after the objid*/
2018-08-14 14:00:31 -07:00
while ( ( index < objstm - > streambuf + objstm - > streambuf_len ) & & isdigit ( * index ) ) {
index + + ;
bytes_remaining - - ;
}
2018-12-03 12:40:13 -05:00
index = findNextNonWS ( index , objstm - > streambuf + objstm - > first ) ;
2018-08-14 14:00:31 -07:00
bytes_remaining = objstm - > streambuf + objstm - > streambuf_len - index ;
2019-01-22 14:15:46 -05:00
if ( CL_SUCCESS ! = cli_strntol_wrap ( index , bytes_remaining , 0 , 10 , & temp_long ) ) {
2018-08-14 14:00:31 -07:00
/* Failed to find obj offset */
cli_dbgmsg ( " pdf_findobj_in_objstm: Failed to find obj offset for obj in object stream \n " ) ;
status = CL_EPARSE ;
goto done ;
2019-01-22 14:15:46 -05:00
} else if ( temp_long < 0 ) {
cli_dbgmsg ( " pdf_findobj_in_objstm: Encountered invalid negative obj offset (%ld). \n " , temp_long ) ;
status = CL_EPARSE ;
goto done ;
2018-08-14 14:00:31 -07:00
}
2019-01-22 14:15:46 -05:00
objoff = ( unsigned long ) temp_long ;
2018-08-14 14:00:31 -07:00
2019-01-22 13:53:29 -05:00
if ( ( size_t ) objstm - > first + ( size_t ) objoff > objstm - > streambuf_len ) {
/* Alleged obj location is further than the length of the stream */
cli_dbgmsg ( " pdf_findobj_in_objstm: obj offset found is greater than the length of the stream. \n " ) ;
status = CL_EPARSE ;
goto done ;
}
2018-08-14 14:00:31 -07:00
objstm - > current = objstm - > first + objoff ;
2018-12-03 12:40:13 -05:00
obj - > id = ( objid < < 8 ) | ( 0 & 0xff ) ;
2018-08-14 14:00:31 -07:00
obj - > start = objstm - > current ;
obj - > flags = 0 ;
objstm - > nobjs_found + + ;
while ( ( index < objstm - > streambuf + objstm - > streambuf_len ) & & isdigit ( * index ) ) {
index + + ;
bytes_remaining - - ;
}
objstm - > current_pair = ( uint32_t ) ( findNextNonWS ( index , objstm - > streambuf + objstm - > first ) - objstm - > streambuf ) ;
/* Update current_pair, if there are more */
if ( ( objstm - > nobjs_found < objstm - > n ) & &
2018-12-03 12:40:13 -05:00
( index < objstm - > streambuf + objstm - > streambuf_len ) ) {
2021-04-06 10:05:10 -07:00
unsigned long next_objoff = 0 ;
2018-08-14 14:00:31 -07:00
2019-03-05 21:15:41 -05:00
/*
* While we ' re at it ,
2018-08-14 14:00:31 -07:00
* lets record the size as running up to the next object offset .
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* To do so , we will need to parse the next obj pair .
*/
2020-07-01 16:56:26 -07:00
/* objstm->current_pair points directly to the objid */
2018-12-03 12:40:13 -05:00
index = objstm - > streambuf + objstm - > current_pair ;
2018-08-14 14:00:31 -07:00
bytes_remaining = objstm - > streambuf + objstm - > streambuf_len - index ;
2021-04-06 10:05:10 -07:00
/* We don't actually care about the object id at this point, so reading the object id is commented out.
I didn ' t delete it entirely in case the object id is needed in the future . */
// if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
// /* Failed to find objid for next obj */
// cli_dbgmsg("pdf_findobj_in_objstm: Failed to find next objid for obj in object stream though there should be {%u} more.\n", objstm->n - objstm->nobjs_found);
// status = CL_EPARSE;
// goto done;
// } else if (temp_long < 0) {
// cli_dbgmsg("pdf_findobj_in_objstm: Encountered invalid negative objid (%ld).\n", temp_long);
// status = CL_EPARSE;
// goto done;
// }
// next_objid = (unsigned long)temp_long;
2018-08-14 14:00:31 -07:00
2020-07-01 16:56:26 -07:00
/* Find the obj offset that appears just after the objid*/
2018-08-14 14:00:31 -07:00
while ( ( index < objstm - > streambuf + objstm - > streambuf_len ) & & isdigit ( * index ) ) {
index + + ;
bytes_remaining - - ;
}
2018-12-03 12:40:13 -05:00
index = findNextNonWS ( index , objstm - > streambuf + objstm - > first ) ;
2018-08-14 14:00:31 -07:00
bytes_remaining = objstm - > streambuf + objstm - > streambuf_len - index ;
2019-01-22 14:15:46 -05:00
if ( CL_SUCCESS ! = cli_strntol_wrap ( index , bytes_remaining , 0 , 10 , & temp_long ) ) {
2018-08-14 14:00:31 -07:00
/* Failed to find obj offset for next obj */
2025-06-30 10:47:02 -04:00
cli_dbgmsg ( " pdf_findobj_in_objstm: Failed to find next obj offset for obj in object stream though there should be {%zu} more. \n " , objstm - > n - objstm - > nobjs_found ) ;
2018-08-14 14:00:31 -07:00
status = CL_EPARSE ;
goto done ;
2019-01-22 14:15:46 -05:00
} else if ( temp_long < 0 ) {
cli_dbgmsg ( " pdf_findobj_in_objstm: Encountered invalid negative obj offset (%ld). \n " , temp_long ) ;
status = CL_EPARSE ;
goto done ;
}
next_objoff = ( unsigned long ) temp_long ;
if ( next_objoff < = objoff ) {
2018-10-01 19:46:23 -04:00
/* Failed to find obj offset for next obj */
cli_dbgmsg ( " pdf_findobj_in_objstm: Found next obj offset for obj in object stream but it's less than or equal to the current one! \n " ) ;
status = CL_EPARSE ;
goto done ;
2018-12-03 12:40:13 -05:00
} else if ( objstm - > first + next_objoff > objstm - > streambuf_len ) {
2018-10-01 19:46:23 -04:00
/* Failed to find obj offset for next obj */
cli_dbgmsg ( " pdf_findobj_in_objstm: Found next obj offset for obj in object stream but it's further out than the size of the stream! \n " ) ;
status = CL_EPARSE ;
goto done ;
}
2018-08-14 14:00:31 -07:00
2018-10-01 19:46:23 -04:00
obj - > size = next_objoff - objoff ;
2018-12-03 12:40:13 -05:00
} else {
2018-08-14 14:00:31 -07:00
/*
* Should be no more objects . We should verify .
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* Either way . . .
2019-03-05 21:15:41 -05:00
* obj - > size should be the rest of the buffer .
2018-08-14 14:00:31 -07:00
*/
if ( objstm - > nobjs_found < objstm - > n ) {
cli_warnmsg ( " pdf_findobj_in_objstm: Fewer objects found in object stream than expected! \n " ) ;
}
obj - > size = objstm - > streambuf_len - obj - > start ;
}
/* Success! Add the object to the list of all objects found. */
pdf - > nobjs + + ;
2024-01-09 17:44:33 -05:00
CLI_MAX_REALLOC_OR_GOTO_DONE ( pdf - > objs , sizeof ( struct pdf_obj * ) * pdf - > nobjs ,
2024-01-09 19:41:17 -05:00
cli_warnmsg ( " pdf_findobj_in_objstm: out of memory finding objects in stream \n " ) ,
status = CL_EMEM ) ;
2018-12-03 12:40:13 -05:00
pdf - > objs [ pdf - > nobjs - 1 ] = obj ;
2018-08-14 14:00:31 -07:00
* obj_found = obj ;
status = CL_SUCCESS ;
done :
if ( CL_SUCCESS ! = status ) {
if ( NULL ! = obj ) {
free ( obj ) ;
}
}
return status ;
}
/**
* @ brief Find the next * indirect * object .
2019-03-05 21:15:41 -05:00
*
* Indirect objects located outside of an object stream are prefaced with :
* < objid > < genid > obj
*
* Each of the above are separated by whitespace of some sort .
*
* Indirect objects are postfaced with :
* endobj
*
* The specification does not say if whitespace is required before or after " endobj " .
*
* Identify truncated objects .
*
2018-08-14 14:00:31 -07:00
* If found , pdf - > offset will be updated to just after the " endobj " .
* If truncated , pdf - > offset will = = pdf - > size .
* If not found , pdf - > offset will not be updated .
2019-03-05 21:15:41 -05:00
*
* @ param pdf Pdf context struct that keeps track of all information found in the PDF .
*
2018-08-14 14:00:31 -07:00
* @ return CL_SUCCESS if success
* @ return CL_BREAK if no more objects
* @ return CL_EPARSE if parsing error
* @ return CL_EMEM if error allocating memory
2018-06-09 09:42:57 -04:00
*/
2018-08-14 14:00:31 -07:00
cl_error_t pdf_findobj ( struct pdf_struct * pdf )
2010-05-10 11:57:44 +03:00
{
2018-08-14 14:00:31 -07:00
cl_error_t status = CL_EPARSE ;
2019-03-05 21:15:41 -05:00
const char * start , * idx , * genid_search_index , * objid_search_index ;
const char * obj_begin = NULL , * obj_end = NULL ;
const char * endobj_begin = NULL , * endobj_end = NULL ;
2018-08-14 14:00:31 -07:00
struct pdf_obj * obj = NULL ;
2019-03-05 21:15:41 -05:00
size_t bytesleft ;
2018-06-02 20:58:35 -04:00
unsigned long genid , objid ;
2019-01-22 14:15:46 -05:00
long temp_long ;
2010-05-10 11:57:44 +03:00
2019-07-15 15:10:24 -07:00
if ( pdf - > nobjs > = MAX_PDF_OBJECTS ) {
pdf - > flags | = 1 < < BAD_PDF_TOOMANYOBJS ;
cli_dbgmsg ( " pdf_findobj: reached object maximum \n " ) ;
status = CL_BREAK ;
goto done ;
}
2010-05-10 11:57:44 +03:00
pdf - > nobjs + + ;
2024-01-09 17:44:33 -05:00
CLI_MAX_REALLOC_OR_GOTO_DONE ( pdf - > objs , sizeof ( struct pdf_obj * ) * pdf - > nobjs , status = CL_EMEM ) ;
2018-08-14 14:00:31 -07:00
obj = malloc ( sizeof ( struct pdf_obj ) ) ;
if ( ! obj ) {
status = CL_EMEM ;
goto done ;
2010-05-10 11:57:44 +03:00
}
2018-12-03 12:40:13 -05:00
pdf - > objs [ pdf - > nobjs - 1 ] = obj ;
2014-04-07 16:39:54 -04:00
2010-05-11 11:26:35 +03:00
memset ( obj , 0 , sizeof ( * obj ) ) ;
2018-08-14 14:00:31 -07:00
2018-12-03 12:40:13 -05:00
start = pdf - > map + pdf - > offset ;
2010-05-10 11:57:44 +03:00
bytesleft = pdf - > size - pdf - > offset ;
2018-08-14 14:00:31 -07:00
2019-03-05 21:15:41 -05:00
/*
* Start by searching for " obj "
*/
idx = start + 1 ;
while ( bytesleft > 1 + strlen ( " obj " ) ) {
/* `- 1` accounts for size of white space before obj */
idx = cli_memstr ( idx , bytesleft - 1 , " obj " , strlen ( " obj " ) ) ;
if ( NULL = = idx ) {
status = CL_BREAK ;
goto done ; /* No more objs. */
2018-08-14 14:00:31 -07:00
}
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
/* verify that the word has a whitespace before it, and is not the end of
* a previous word */
idx - - ;
bytesleft = ( pdf - > size - pdf - > offset ) - ( size_t ) ( idx - start ) ;
2018-08-14 14:00:31 -07:00
2019-03-05 21:15:41 -05:00
if ( * idx ! = 0 & & * idx ! = 9 & & * idx ! = 0xa & & * idx ! = 0xc & & * idx ! = 0xd & & * idx ! = 0x20 ) {
/* This instance of "obj" appears to be part of a longer string.
2018-08-14 14:00:31 -07:00
* Skip it , and keep searching for an object . */
2019-03-05 21:15:41 -05:00
idx + = 1 + strlen ( " obj " ) ;
bytesleft - = 1 + strlen ( " obj " ) ;
2014-04-07 16:39:54 -04:00
continue ;
}
2019-03-05 21:15:41 -05:00
/* Found the beginning of the word */
obj_begin = idx ;
obj_end = idx + 1 + strlen ( " obj " ) ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
break ;
2018-08-14 14:00:31 -07:00
}
2019-03-05 21:15:41 -05:00
if ( ( NULL = = obj_begin ) | | ( NULL = = obj_end ) ) {
status = CL_BREAK ;
goto done ; /* No more objs. */
}
2010-08-31 10:53:29 +03:00
2018-08-14 14:00:31 -07:00
/* Find the generation id (genid) that appears before the "obj" */
2019-03-05 21:15:41 -05:00
genid_search_index = findNextNonWSBack ( obj_begin - 1 , start ) ;
while ( genid_search_index > start & & isdigit ( * genid_search_index ) )
genid_search_index - - ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
if ( CL_SUCCESS ! = cli_strntol_wrap ( genid_search_index , ( size_t ) ( ( obj_begin ) - genid_search_index ) , 0 , 10 , & temp_long ) ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " pdf_findobj: Failed to parse object genid (# objects found: %u) \n " , pdf - > nobjs ) ;
2018-06-09 09:42:57 -04:00
/* Failed to parse, probably not a real object. Skip past the "obj" thing, and continue. */
2019-03-05 21:15:41 -05:00
pdf - > offset = obj_end - pdf - > map ;
2018-12-03 12:40:13 -05:00
status = CL_EPARSE ;
2018-08-14 14:00:31 -07:00
goto done ;
2019-01-22 14:15:46 -05:00
} else if ( temp_long < 0 ) {
cli_dbgmsg ( " pdf_findobj: Encountered invalid negative obj genid (%ld). \n " , temp_long ) ;
2019-03-05 21:15:41 -05:00
pdf - > offset = obj_end - pdf - > map ;
2019-01-22 14:15:46 -05:00
status = CL_EPARSE ;
goto done ;
2018-06-02 20:58:35 -04:00
}
2019-01-22 14:15:46 -05:00
genid = ( unsigned long ) temp_long ;
2018-08-14 14:00:31 -07:00
2019-03-05 21:15:41 -05:00
/* Find the object id (objid) that appears before the genid */
objid_search_index = findNextNonWSBack ( genid_search_index - 1 , start ) ;
while ( objid_search_index > start & & isdigit ( * objid_search_index ) )
objid_search_index - - ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
if ( CL_SUCCESS ! = cli_strntol_wrap ( objid_search_index , ( size_t ) ( ( genid_search_index ) - objid_search_index ) , 0 , 10 , & temp_long ) ) {
2018-06-02 20:58:35 -04:00
/*
2019-03-05 21:15:41 -05:00
* Edge case :
*
* PDFs with multiple revisions will have % % EOF before the end of the file ,
* followed by the next revision of the PDF , which will probably be an immediate objid .
*
* Example :
* % % EOF1 1 obj < blah > endobj
*
* If this is the case , we can detect it and continue parsing after the % % EOF .
2018-06-02 20:58:35 -04:00
*/
2024-03-20 16:02:21 -04:00
if ( objid_search_index - strlen ( " %%EO " ) > start ) {
const char * lastfile = objid_search_index - strlen ( " %%EO " ) ;
if ( 0 ! = strncmp ( lastfile , " %%EOF " , 5 ) ) {
2018-06-09 09:42:57 -04:00
/* Nope, wasn't %%EOF */
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " pdf_findobj: Failed to parse object objid (# objects found: %u) \n " , pdf - > nobjs ) ;
2018-06-09 09:42:57 -04:00
/* Skip past the "obj" thing, and continue. */
2019-03-05 21:15:41 -05:00
pdf - > offset = obj_end - pdf - > map ;
2018-12-03 12:40:13 -05:00
status = CL_EPARSE ;
2018-08-14 14:00:31 -07:00
goto done ;
2018-06-02 20:58:35 -04:00
}
2019-03-05 21:15:41 -05:00
/* Yup, Looks, like the file continues after %%EOF.
2018-06-02 20:58:35 -04:00
* Probably another revision . Keep parsing . . . */
2019-03-05 21:15:41 -05:00
objid_search_index + + ;
2024-03-20 16:02:21 -04:00
cli_dbgmsg ( " pdf_findobj: %%%%EOF detected before end of file, at offset: %zu \n " , ( size_t ) ( objid_search_index - pdf - > map ) ) ;
2018-06-02 20:58:35 -04:00
} else {
/* Failed parsing at the very beginning */
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " pdf_findobj: Failed to parse object objid (# objects found: %u) \n " , pdf - > nobjs ) ;
2018-06-09 09:42:57 -04:00
/* Probably not a real object. Skip past the "obj" thing, and continue. */
2019-03-05 21:15:41 -05:00
pdf - > offset = obj_end - pdf - > map ;
2018-12-03 12:40:13 -05:00
status = CL_EPARSE ;
2018-08-14 14:00:31 -07:00
goto done ;
2018-06-02 20:58:35 -04:00
}
/* Try again, with offset slightly adjusted */
2019-03-05 21:15:41 -05:00
if ( CL_SUCCESS ! = cli_strntol_wrap ( objid_search_index , ( size_t ) ( ( genid_search_index - 1 ) - objid_search_index ) , 0 , 10 , & temp_long ) ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " pdf_findobj: Failed to parse object objid (# objects found: %u) \n " , pdf - > nobjs ) ;
2018-06-09 09:42:57 -04:00
/* Still failed... Probably not a real object. Skip past the "obj" thing, and continue. */
2019-03-05 21:15:41 -05:00
pdf - > offset = obj_end - pdf - > map ;
2018-12-03 12:40:13 -05:00
status = CL_EPARSE ;
2018-08-14 14:00:31 -07:00
goto done ;
2019-01-22 14:15:46 -05:00
} else if ( temp_long < 0 ) {
cli_dbgmsg ( " pdf_findobj: Encountered invalid negative objid (%ld). \n " , temp_long ) ;
2019-03-05 21:15:41 -05:00
pdf - > offset = obj_end - pdf - > map ;
2019-01-22 14:15:46 -05:00
status = CL_EPARSE ;
goto done ;
2018-06-02 20:58:35 -04:00
}
2019-01-22 14:15:46 -05:00
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " pdf_findobj: There appears to be an additional revision. Continuing to parse... \n " ) ;
2019-01-22 14:15:46 -05:00
} else if ( temp_long < 0 ) {
cli_dbgmsg ( " pdf_findobj: Encountered invalid negative objid (%ld). \n " , temp_long ) ;
2019-03-05 21:15:41 -05:00
pdf - > offset = obj_end - pdf - > map ;
2019-01-22 14:15:46 -05:00
status = CL_EPARSE ;
goto done ;
2018-06-02 20:58:35 -04:00
}
2019-01-22 14:15:46 -05:00
objid = ( unsigned long ) temp_long ;
2018-08-14 14:00:31 -07:00
2018-12-03 12:40:13 -05:00
obj - > id = ( objid < < 8 ) | ( genid & 0xff ) ;
2019-03-05 21:15:41 -05:00
obj - > start = obj_end - pdf - > map ; /* obj start begins just after the "obj" string */
2010-05-10 11:57:44 +03:00
obj - > flags = 0 ;
2018-08-14 14:00:31 -07:00
2019-03-05 21:15:41 -05:00
/*
* We now have the objid , genid , and object start .
* Find the object end ( " endobj " ) .
*/
/* `- 1` accounts for size of white space before obj */
endobj_begin = cli_memstr ( obj_end , pdf - > map + pdf - > size - obj_end , " endobj " , strlen ( " endobj " ) ) ;
if ( NULL = = endobj_begin ) {
/* No end to object.
* PDF appears to be malformed or truncated .
         * Will record the object size as going to the end of the file.
* Will record that the object is truncated .
* Will position the pdf offset to the end of the PDF .
* The next iteration of this function will find no more objects . */
obj - > flags | = 1 < < OBJ_TRUNCATED ;
obj - > size = ( pdf - > map + pdf - > size ) - obj_end ;
pdf - > offset = pdf - > size ;
/* Truncated "object" found! */
status = CL_SUCCESS ;
goto done ;
2010-05-10 11:57:44 +03:00
}
2019-03-05 21:15:41 -05:00
endobj_end = endobj_begin + strlen ( " endobj " ) ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
/* Size of the object goes from "obj" <-> "endobject". */
obj - > size = endobj_begin - obj_end ;
pdf - > offset = endobj_end - pdf - > map ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
/*
* Object found !
*/
2018-08-14 14:00:31 -07:00
status = CL_SUCCESS ; /* truncated file, no end to obj. */
done :
if ( status = = CL_SUCCESS ) {
2019-03-05 21:15:41 -05:00
cli_dbgmsg ( " pdf_findobj: found %d %d obj @%lld, size: %zu bytes. \n " , obj - > id > > 8 , obj - > id & 0xff , ( long long ) ( obj - > start + pdf - > startoff ) , obj - > size ) ;
2018-12-03 12:40:13 -05:00
} else {
2018-08-14 14:00:31 -07:00
/* Remove the unused obj reference from our list of objects found */
/* No need to realloc pdf->objs back down. It won't leak. */
2018-12-03 12:40:13 -05:00
pdf - > objs [ pdf - > nobjs - 1 ] = NULL ;
2018-08-14 14:00:31 -07:00
pdf - > nobjs - - ;
/* Free up the obj struct. */
if ( NULL ! = obj )
free ( obj ) ;
2019-03-05 21:15:41 -05:00
if ( status = = CL_BREAK ) {
cli_dbgmsg ( " pdf_findobj: No more objects (# objects found: %u) \n " , pdf - > nobjs ) ;
} else if ( status = = CL_EMEM ) {
cli_warnmsg ( " pdf_findobj: Error allocating memory (# objects found: %u) \n " , pdf - > nobjs ) ;
} else {
cli_dbgmsg ( " pdf_findobj: Unexpected status code %d. \n " , status ) ;
}
2018-08-14 14:00:31 -07:00
}
2018-12-03 12:40:13 -05:00
return status ;
2010-05-10 11:57:44 +03:00
}
2017-08-16 17:31:45 -04:00
static size_t filter_writen ( struct pdf_struct * pdf , struct pdf_obj * obj , int fout , const char * buf , size_t len , size_t * sum )
2010-05-11 10:37:10 +03:00
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
Windows: Fix C/Rust FFI compat issue + Windows compile warnings
Primarily this commit fixes an issue with the size of the parameters
passed to cli_checklimits(). The parameters were "unsigned long", which
varies in size depending on platform.
I've switched them to uint64_t / u64.
While working on this, I observed some concerning warnigns on Windows,
and some less serious ones, primarily regarding inconsistencies with
`const` parameters.
Finally, in `scanmem.c`, there is a warning regarding use of `wchar_t *`
with `GetModuleFileNameEx()` instead of `GetModuleFileNameExW()`.
This made me realize this code assumes we're not defining `UNICODE`,
which would have such macros use the 'A' variant.
I have fixed it the best I can, although I'm still a little
uncomfortable with some of this code that uses `char` or `wchar_t`
instead of TCHAR.
I also remove the `if (GetModuleFileNameEx) {` conditional, because this
macro/function will always be defined. The original code was checking a
function pointer, and so this was a bug when integrating into ClamAV.
Regarding the changes to `rijndael.c`, I found that this module assumes
`unsigned long` == 32bits. It does not.
I have corrected it to use `uint32_t`.
2024-03-20 12:21:40 -04:00
if ( cli_checklimits ( " pdf " , pdf - > ctx , ( uint64_t ) * sum , 0 , 0 ) )
return len ;
2014-04-07 16:39:54 -04:00
2010-05-11 10:37:10 +03:00
* sum + = len ;
2014-04-07 16:39:54 -04:00
2019-05-04 15:54:54 -04:00
return cli_writen ( fout , buf , len ) ;
2010-05-11 10:37:10 +03:00
}
2016-03-31 12:29:16 -04:00
void pdfobj_flag ( struct pdf_struct * pdf , struct pdf_obj * obj , enum pdf_flag flag )
2010-07-30 14:23:10 +03:00
{
2018-12-03 12:40:13 -05:00
const char * s = " " ;
2010-07-30 14:23:10 +03:00
pdf - > flags | = 1 < < flag ;
if ( ! cli_debug_flag )
2014-04-07 16:39:54 -04:00
return ;
2010-07-30 14:23:10 +03:00
switch ( flag ) {
2018-12-03 12:40:13 -05:00
case UNTERMINATED_OBJ_DICT :
s = " dictionary not terminated " ;
break ;
case ESCAPED_COMMON_PDFNAME :
/* like /JavaScript */
s = " escaped common pdfname " ;
break ;
case BAD_STREAM_FILTERS :
s = " duplicate stream filters " ;
break ;
case BAD_PDF_VERSION :
s = " bad pdf version " ;
break ;
case BAD_PDF_HEADERPOS :
s = " bad pdf header position " ;
break ;
case BAD_PDF_TRAILER :
s = " bad pdf trailer " ;
break ;
case BAD_PDF_TOOMANYOBJS :
s = " too many pdf objs " ;
break ;
case BAD_FLATE :
s = " bad deflate stream " ;
break ;
case BAD_FLATESTART :
s = " bad deflate stream start " ;
break ;
case BAD_STREAMSTART :
s = " bad stream start " ;
break ;
case UNKNOWN_FILTER :
s = " unknown filter used " ;
break ;
case BAD_ASCIIDECODE :
s = " bad ASCII decode " ;
break ;
case HEX_JAVASCRIPT :
s = " hex javascript " ;
break ;
case BAD_INDOBJ :
s = " referencing nonexistent obj " ;
break ;
case HAS_OPENACTION :
s = " has /OpenAction " ;
break ;
case HAS_LAUNCHACTION :
s = " has /LaunchAction " ;
break ;
case BAD_STREAMLEN :
s = " bad /Length, too small " ;
break ;
case ENCRYPTED_PDF :
s = " PDF is encrypted " ;
break ;
case LINEARIZED_PDF :
s = " linearized PDF " ;
break ;
case MANY_FILTERS :
s = " more than 2 filters per obj " ;
break ;
case DECRYPTABLE_PDF :
s = " decryptable PDF " ;
break ;
}
cli_dbgmsg ( " pdfobj_flag: %s flagged in object %u %u \n " , s , obj - > id > > 8 , obj - > id & 0xff ) ;
2010-07-30 14:23:10 +03:00
}
2014-06-25 13:36:30 -04:00
struct pdf_obj * find_obj ( struct pdf_struct * pdf , struct pdf_obj * obj , uint32_t objid )
2010-05-11 10:37:10 +03:00
{
2014-06-10 13:21:31 -04:00
uint32_t j ;
uint32_t i ;
2010-05-11 13:33:07 +03:00
/* search starting at previous obj (if exists) */
2018-08-14 14:00:31 -07:00
for ( i = 0 ; i < pdf - > nobjs ; i + + ) {
if ( pdf - > objs [ i ] = = obj )
break ;
}
2014-04-07 16:39:54 -04:00
2018-08-14 14:00:31 -07:00
for ( j = i ; j < pdf - > nobjs ; j + + ) {
obj = pdf - > objs [ j ] ;
2014-04-07 16:39:54 -04:00
if ( obj - > id = = objid )
return obj ;
2010-05-11 10:37:10 +03:00
}
2014-04-07 16:39:54 -04:00
2010-05-11 10:37:10 +03:00
/* restart search from beginning if not found */
2018-08-14 14:00:31 -07:00
for ( j = 0 ; j < i ; j + + ) {
obj = pdf - > objs [ j ] ;
2014-04-07 16:39:54 -04:00
if ( obj - > id = = objid )
return obj ;
2010-05-11 10:37:10 +03:00
}
2014-04-07 16:39:54 -04:00
2010-05-11 10:37:10 +03:00
return NULL ;
}
2018-08-14 14:00:31 -07:00
/**
* @ brief Find and interpret the " /Length " dictionary key value .
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* The value may be :
2019-03-05 21:15:41 -05:00
* - a direct object ( i . e . just a number )
2018-08-14 14:00:31 -07:00
* - an indirect object , where the value is somewhere else in the document and we have to look it up .
 *      indirect objects are referenced using an object id (objid), a generation id (genid), and the letter 'R'.
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
 * Example dictionary with a single key "/Length" that relies on a direct object for the value.
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* 1 0 obj
* < < / Length 534
* / Filter [ / ASCII85Decode / LZWDecode ]
* > >
* stream
* J . . ) 6 T ` ? p & < ! J9 % _ [ umg " B7/Z7KNXbN'S+,*Q/& " OLT ' FLIDK # ! n ` $ " <Atdi` \ Vn%b%)&'cA*VnK \ CJY(sF>c!Jnl@
* RM ] WM ; jjH6Gnc75idkL5 ] + cPZKEBPWdR > FF ( kj1_R % W_d & / jS ! ; iuad7h ? [ L - F $ + ] ] 0 A3Ck * $ I0KZ ? ; < ) CJtqi65Xb
* Vc3 \ n5ua : Q / = 0 $ W < # N3U ; H , MQKqfg1 ? : lUpR ; 6 oN [ C2E4ZNr8Udn . ' p + ? # X + 1 > 0 Kuk $ bCDF / ( 3f L5 ] Oq ) ^ kJZ ! C2H1
* ' TO ] Rl ? Q : & ' < 5 & iP ! $ Rq ; BXRecDN [ IJB ` , ) o8XJOSJ9sDS ] hQ ; Rj @ ! ND ) bD_q & C \ g : inYC % ) & u # : u , M6Bm % IY ! Kb1 +
* " :aAa'S`ViJglLb8<W9k6Yl \\ 0McJQkDeLWdPN?9A'jX*al>iG1p&i;eVoK&juJHs9%;Xomop " 5 KatWRT " JQ#qYuL,
* JD ? M $ 0 QP ) lKn06l1apKDC @ \ qJ4B ! ! ( 5 m + j .7F 790 m ( Vj88l8Q : _CZ ( Gm1 % X \ N1 & u ! FKHMB ~ >
* endstream
* endobj
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* Example dictionary with a single key " /Length " that relies on an indirect object for the value .
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* 7 0 obj
* < < / Length 8 0 R > > % An indirect reference to object 8 , with generation id 0.
* stream
* BT
* / F1 12 Tf
* 72 712 Td
* ( A stream with an indirect length ) Tj
* ET
* endstream
* endobj
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* 8 0 obj
* 77 % The length of the preceding stream
* endobj
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* @ param pdf Pdf context structure .
* @ param obj Pdf object context structure .
 * @param dict_start  Pointer to the start of the dictionary string.
 * @param dict_len    Remaining length of the dictionary string in bytes.
* @ return size_t Unsigned integer value of the " /Length " key
*/
static size_t find_length ( struct pdf_struct * pdf , struct pdf_obj * obj , const char * dict_start , size_t dict_len )
2010-05-11 10:37:10 +03:00
{
2019-01-22 14:15:46 -05:00
size_t length = 0 ;
const char * obj_start = dict_start ;
size_t bytes_remaining = dict_len ;
long temp_long = 0 ;
2018-08-14 14:00:31 -07:00
const char * index ;
2014-04-07 16:39:54 -04:00
2018-08-14 14:00:31 -07:00
if ( bytes_remaining < 8 ) {
2014-04-07 16:39:54 -04:00
return 0 ;
2018-08-14 14:00:31 -07:00
}
2014-04-07 16:39:54 -04:00
2018-08-14 14:00:31 -07:00
/*
* Find the " /Length " dictionary key
*/
index = cli_memstr ( obj_start , bytes_remaining , " /Length " , 7 ) ;
if ( ! index )
2014-04-07 16:39:54 -04:00
return 0 ;
2023-04-19 16:10:16 -07:00
bytes_remaining - = index - obj_start ;
2018-08-14 14:00:31 -07:00
if ( bytes_remaining < 1 ) {
2018-06-02 20:58:35 -04:00
return 0 ;
}
2018-08-14 14:00:31 -07:00
/* Step the index into the "/Length" string. */
index + + ;
2023-08-07 13:35:58 -07:00
bytes_remaining - - ;
2018-08-14 14:00:31 -07:00
/* Find the start of the next direct or indirect object.
* pdf_nextobject ( ) assumes we started searching from within a previous object */
obj_start = pdf_nextobject ( index , bytes_remaining ) ;
if ( ! obj_start )
return 0 ;
2019-03-05 21:15:41 -05:00
if ( bytes_remaining < ( size_t ) ( obj_start - index ) ) {
2018-08-14 14:00:31 -07:00
return 0 ;
}
bytes_remaining - = obj_start - index ;
index = obj_start ;
2018-12-03 12:40:13 -05:00
2018-08-14 14:00:31 -07:00
/* Read the value. This could either be the direct length value,
or the object id of the indirect object that has the length */
2019-01-22 14:15:46 -05:00
if ( CL_SUCCESS ! = cli_strntol_wrap ( index , bytes_remaining , 0 , 10 , & temp_long ) ) {
cli_dbgmsg ( " find_length: failed to parse object length or objid \n " ) ;
return 0 ;
} else if ( temp_long < 0 ) {
cli_dbgmsg ( " find_length: Encountered invalid negative object length or objid (%ld). \n " , temp_long ) ;
2018-08-14 14:00:31 -07:00
return 0 ;
}
2019-01-22 14:15:46 -05:00
length = ( size_t ) temp_long ; /* length or maybe object id */
2018-08-14 14:00:31 -07:00
2019-03-05 21:15:41 -05:00
/*
* Keep parsing , skipping past the first integer that might have been what we wanted .
* If it ' s an indirect object , we ' ll find a Generation ID followed by the letter ' R '
* I . e . something like " 0 R "
2018-08-14 14:00:31 -07:00
*/
while ( ( bytes_remaining > 0 ) & & isdigit ( * index ) ) {
index + + ;
bytes_remaining - - ;
2018-06-01 14:23:25 -04:00
}
2014-04-07 16:39:54 -04:00
2018-08-14 14:00:31 -07:00
if ( ( bytes_remaining > 0 ) & & ( * index = = ' ' ) ) {
2018-06-02 20:58:35 -04:00
unsigned long genid ;
2018-08-14 14:00:31 -07:00
index + + ;
bytes_remaining - - ;
2019-01-22 14:15:46 -05:00
if ( CL_SUCCESS ! = cli_strntol_wrap ( index , bytes_remaining , 0 , 10 , & temp_long ) ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " find_length: failed to parse object genid \n " ) ;
2018-06-02 20:58:35 -04:00
return 0 ;
2019-01-22 14:15:46 -05:00
} else if ( temp_long < 0 ) {
cli_dbgmsg ( " find_length: Encountered invalid negative object genid (%ld). \n " , temp_long ) ;
return 0 ;
2018-06-02 20:58:35 -04:00
}
2019-01-22 14:15:46 -05:00
genid = ( unsigned long ) temp_long ;
2014-04-07 16:39:54 -04:00
2018-12-03 12:40:13 -05:00
while ( ( bytes_remaining > 0 ) & & isdigit ( * index ) ) {
2018-08-14 14:00:31 -07:00
index + + ;
bytes_remaining - - ;
}
if ( bytes_remaining < 2 ) {
return 0 ;
2018-06-01 14:23:25 -04:00
}
2014-04-07 16:39:54 -04:00
2018-08-14 14:00:31 -07:00
if ( index [ 0 ] = = ' ' & & index [ 1 ] = = ' R ' ) {
2019-03-05 21:15:41 -05:00
/*
* Ok so we found a genid and that ' R ' . Which means that first value
2018-08-14 14:00:31 -07:00
* was actually the objid .
* We can look up the indirect object using this information .
*/
2018-12-03 12:40:13 -05:00
unsigned long objid = length ;
const char * indirect_obj_start = NULL ;
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " find_length: length is in indirect object %lu %lu \n " , objid , genid ) ;
2014-04-07 16:39:54 -04:00
2018-12-03 12:40:13 -05:00
obj = find_obj ( pdf , obj , ( length < < 8 ) | ( genid & 0xff ) ) ;
2014-04-07 16:39:54 -04:00
if ( ! obj ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " find_length: indirect object not found \n " ) ;
2014-04-07 16:39:54 -04:00
return 0 ;
}
2024-07-16 11:22:05 -04:00
if ( NULL = = obj - > objstm ) {
indirect_obj_start = ( const char * ) ( obj - > start + pdf - > map ) ;
if ( ! CLI_ISCONTAINED ( pdf - > map , pdf - > size , indirect_obj_start , obj - > size ) ) {
cli_dbgmsg ( " find_length: indirect object found, but not contained in PDF \n " ) ;
return 0 ;
}
bytes_remaining = pdf - > size - obj - > start ;
} else {
indirect_obj_start = ( const char * ) ( obj - > start + obj - > objstm - > streambuf ) ;
if ( ! CLI_ISCONTAINED ( obj - > objstm - > streambuf , obj - > objstm - > streambuf_len , indirect_obj_start , obj - > size ) ) {
cli_dbgmsg ( " find_length: indirect object found, but not contained in PDF streambuf \n " ) ;
return 0 ;
}
bytes_remaining = obj - > objstm - > streambuf_len - obj - > start ;
}
2018-12-03 12:40:13 -05:00
2018-08-14 14:00:31 -07:00
/* Ok so we found the indirect object, lets read the value. */
index = pdf_nextobject ( indirect_obj_start , bytes_remaining ) ;
if ( ! index ) {
cli_dbgmsg ( " find_length: next object not found \n " ) ;
2014-04-07 16:39:54 -04:00
return 0 ;
}
2018-12-03 12:40:13 -05:00
2019-03-05 21:15:41 -05:00
if ( bytes_remaining < ( size_t ) ( index - indirect_obj_start ) ) {
2018-08-14 14:00:31 -07:00
return 0 ;
}
bytes_remaining - = index - indirect_obj_start ;
2014-04-07 16:39:54 -04:00
2019-01-22 14:15:46 -05:00
/* Found the value, so lets parse it as a long, but prohibit negative lengths. */
if ( CL_SUCCESS ! = cli_strntol_wrap ( index , bytes_remaining , 0 , 10 , & temp_long ) ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " find_length: failed to parse object length from indirect object \n " ) ;
2018-06-02 20:58:35 -04:00
return 0 ;
2019-01-22 14:15:46 -05:00
} else if ( temp_long < 0 ) {
cli_dbgmsg ( " find_length: Encountered invalid negative obj length (%ld). \n " , temp_long ) ;
return 0 ;
2018-06-02 20:58:35 -04:00
}
2019-01-22 14:15:46 -05:00
length = ( size_t ) temp_long ;
2014-04-07 16:39:54 -04:00
}
2010-05-11 10:37:10 +03:00
}
2014-04-07 16:39:54 -04:00
2010-05-11 13:22:45 +03:00
/* limit length */
2019-03-05 21:15:41 -05:00
if ( ( size_t ) ( obj_start - pdf - > map ) + length + 5 > pdf - > size )
2018-08-14 14:00:31 -07:00
length = pdf - > size - ( obj_start - pdf - > map ) - 5 ;
2014-04-07 16:39:54 -04:00
2010-05-11 10:37:10 +03:00
return length ;
}
2011-12-23 17:40:22 +02:00
/* Union of the per-object flag bits listed below.
 * NOTE(review): presumably selects which objects get dumped -- confirm against the users of this mask. */
#define DUMP_MASK                 \
    ((1 << OBJ_CONTENTS) |        \
     (1 << OBJ_FILTER_FLATE) |    \
     (1 << OBJ_FILTER_DCT) |      \
     (1 << OBJ_FILTER_AH) |       \
     (1 << OBJ_FILTER_A85) |      \
     (1 << OBJ_EMBEDDED_FILE) |   \
     (1 << OBJ_JAVASCRIPT) |      \
     (1 << OBJ_OPENACTION) |      \
     (1 << OBJ_LAUNCHACTION))
2010-05-11 11:26:35 +03:00
2025-06-08 01:12:33 -04:00
static int run_pdf_hooks ( struct pdf_struct * pdf , enum pdf_phase phase , int fd , const char * filepath )
2010-08-02 15:42:58 +03:00
{
int ret ;
struct cli_bc_ctx * bc_ctx ;
2023-04-19 16:10:16 -07:00
cli_ctx * ctx = NULL ;
2010-08-02 15:42:58 +03:00
fmap_t * map ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
return CL_EARG ;
ctx = pdf - > ctx ;
2010-08-02 15:42:58 +03:00
bc_ctx = cli_bytecode_context_alloc ( ) ;
if ( ! bc_ctx ) {
2018-08-14 14:00:31 -07:00
cli_errmsg ( " run_pdf_hooks: can't allocate memory for bc_ctx \n " ) ;
2014-04-07 16:39:54 -04:00
return CL_EMEM ;
2010-08-02 15:42:58 +03:00
}
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │ └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │ └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
| └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │ └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │ └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │ └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │ └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │ └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │ └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │ └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
| └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfx.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if inside of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │ └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │ └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
| └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the end of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set similar flag for the layer. The context flag is
reset so that anything after this doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost which result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurrence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the performance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't get multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded events will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
map = ctx - > fmap ;
2010-08-02 15:42:58 +03:00
if ( fd ! = - 1 ) {
2025-06-08 01:12:33 -04:00
/* The fmap in this bytecode context is an extracted pdf object. */
map = fmap_new ( fd , 0 , 0 , NULL , filepath ) ;
2014-04-07 16:39:54 -04:00
if ( ! map ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " run_pdf_hooks: can't mmap pdf extracted obj \n " ) ;
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │ └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │ └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
| └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │ └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │ └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │ └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │ └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │ └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │ └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │ └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
| └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfx.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if inside of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │ └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │ └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
| └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection to only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the end of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set a similar flag for the layer. The context flag is
then reset so that anything after this doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost, which would result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filetypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurrence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the performance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't report multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded event will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
map = ctx - > fmap ;
2018-12-03 12:40:13 -05:00
fd = - 1 ;
2014-04-07 16:39:54 -04:00
}
2010-08-02 15:42:58 +03:00
}
2014-04-07 16:39:54 -04:00
cli_bytecode_context_setpdf ( bc_ctx , phase , pdf - > nobjs , pdf - > objs , & pdf - > flags , pdf - > size , pdf - > startoff ) ;
2010-08-02 15:42:58 +03:00
cli_bytecode_context_setctx ( bc_ctx , ctx ) ;
2012-10-18 14:12:58 -07:00
ret = cli_bytecode_runhook ( ctx , ctx - > engine , bc_ctx , BC_PDF , map ) ;
2010-08-02 15:42:58 +03:00
cli_bytecode_context_destroy ( bc_ctx ) ;
2014-04-07 16:39:54 -04:00
if ( fd ! = - 1 )
2025-06-08 01:12:33 -04:00
fmap_free ( map ) ;
2014-04-07 16:39:54 -04:00
2010-08-02 15:42:58 +03:00
return ret ;
}
2011-12-15 13:27:31 +02:00
static void dbg_printhex ( const char * msg , const char * hex , unsigned len ) ;
2014-07-10 18:11:49 -04:00
2016-12-17 12:47:54 +01:00
/**
 * @brief Decrypt AES-CBC data with a 128-, 192- or 256-bit key.
 *
 * Only whole 16-byte blocks are decrypted; a trailing partial block is ignored.
 * No status is returned: when the arguments are rejected or the key schedule
 * cannot be set up, the function returns without touching `q` or `*length`.
 *
 * @param in             Ciphertext. When has_iv is non-zero the first 16 bytes are the IV.
 * @param[in,out] length On entry, the number of bytes available at `in` (must be >= 32).
 *                       After decryption it is reduced by the bytes that are not plaintext:
 *                       the IV, any trailing partial block and, when has_iv is set and the
 *                       padding is well formed, the padding bytes.
 * @param[out] q         Receives one 16-byte plaintext block per full ciphertext block.
 * @param key            Key bytes.
 * @param key_n          Key length in bytes; must be 16, 24 or 32.
 * @param has_iv         Non-zero if `in` starts with a 16-byte IV and the plaintext ends with
 *                       padding; zero to use an all-zero IV and skip padding handling.
 */
static void aes_256cbc_decrypt(const unsigned char *in, size_t *length, unsigned char *q, char *key, unsigned key_n, int has_iv)
{
    uint32_t rk[RKLENGTH(256)];
    unsigned char iv[16];
    size_t len = 0;
    unsigned char pad;
    unsigned i;
    int nrounds;

    if (in == NULL || length == NULL) {
        cli_dbgmsg("aes_256cbc_decrypt: invalid NULL parameters!\n");
        noisy_warnmsg("aes_256cbc_decrypt: invalid NULL parameters!\n");
        return;
    }

    len = *length;

    /* key_n is unsigned, so it is printed with %u (the original used %d). */
    cli_dbgmsg("aes_256cbc_decrypt: key length: %u, data length: %zu\n", key_n, *length);

    if (!(key_n == 16 || key_n == 24 || key_n == 32)) {
        cli_dbgmsg("aes_256cbc_decrypt: invalid key length: %u!\n", key_n * 8);
        noisy_warnmsg("aes_256cbc_decrypt: invalid key length: %u!\n", key_n * 8);
        return;
    }

    /* At least two blocks are required, so that one block remains after an IV is consumed. */
    if (len < 32) {
        cli_dbgmsg("aes_256cbc_decrypt: len is <32: %zu\n", len);
        noisy_warnmsg("aes_256cbc_decrypt: len is <32: %zu\n", len);
        return;
    }

    if (has_iv) {
        memcpy(iv, in, 16);
        in += 16;
        len -= 16;
    } else {
        memset(iv, 0, sizeof(iv));
    }

    cli_dbgmsg("aes_256cbc_decrypt: Calling rijndaelSetupDecrypt\n");
    nrounds = rijndaelSetupDecrypt(rk, (const unsigned char *)key, key_n * 8);
    if (!nrounds) {
        cli_dbgmsg("aes_256cbc_decrypt: nrounds = 0\n");
        return;
    }

    cli_dbgmsg("aes_256cbc_decrypt: Beginning rijndaelDecrypt\n");

    /* CBC: plaintext = D(ciphertext block) XOR previous ciphertext block (or the IV). */
    while (len >= 16) {
        rijndaelDecrypt(rk, nrounds, in, q);
        for (i = 0; i < 16; i++)
            q[i] ^= iv[i];

        memcpy(iv, in, 16);
        q += 16;
        in += 16;
        len -= 16;
    }

    if (has_iv) {
        /* From here on `len` counts the bytes to subtract from *length: leftover + IV. */
        len += 16;

        /* The final plaintext byte gives the padding length. */
        pad = q[-1];
        if (pad > 0x10) {
            cli_dbgmsg("aes_256cbc_decrypt: bad pad: %x (extra len: %zu)\n", pad, len - 16);
            noisy_warnmsg("aes_256cbc_decrypt: bad pad: %x (extra len: %zu)\n", pad, len - 16);
            *length -= len;
            return;
        }

        q -= pad;
        /* NOTE(review): the check starts at index 1, so the first padding byte q[0] is not verified. */
        for (i = 1; i < pad; i++) {
            if (q[i] != pad) {
                cli_dbgmsg("aes_256cbc_decrypt: bad pad: %x != %x\n", q[i], pad);
                noisy_warnmsg("aes_256cbc_decrypt: bad pad: %x != %x\n", q[i], pad);
                *length -= len;
                return;
            }
        }

        len += pad;
    }

    *length -= len;

    cli_dbgmsg("aes_256cbc_decrypt: length is %zu\n", *length);
}
2019-08-23 12:57:18 -04:00
/**
 * @brief Encrypt data with AES-CBC using a key of at most 128 bits.
 *
 * Only whole 16-byte blocks are encrypted; a trailing partial block is ignored.
 * `in` and `out` may point to the same buffer: each input block is folded into the
 * chaining value before the corresponding output block is written.
 *
 * @param in              Plaintext.
 * @param in_length       Number of bytes at `in`; must be at least 16.
 * @param[out] out        Receives one 16-byte ciphertext block per full input block.
 * @param[out] out_length Number of bytes written to `out`. It is set to 0 as soon as the
 *                        pointers have been validated, so it is well defined on every
 *                        later failure path.
 * @param key             Key bytes.
 * @param key_n           Key length in bytes; values above 16 are rejected.
 * @param iv              Optional 16-byte IV; an all-zero IV is used when NULL.
 */
static void aes_128cbc_encrypt(const unsigned char *in, size_t in_length, unsigned char *out, size_t *out_length, const unsigned char *key, size_t key_n, const unsigned char *iv)
{
    uint32_t rk[RKLENGTH(128)];
    unsigned char real_iv[16] = {0};
    int nrounds;
    size_t i;

    if (in == NULL || out == NULL || out_length == NULL || key == NULL) {
        cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: invalid NULL parameters!\n");
        return;
    }

    /* Nothing has been produced yet; callers must never see an indeterminate length. */
    *out_length = 0;

    cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: key length: %zu, data length: %zu\n", key_n, in_length);
    if (key_n > 16) {
        cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: key length is %zu!\n", key_n * 8);
        return;
    }

    if (in_length < 16) {
        cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: in_length is <16: %zu\n", in_length);
        noisy_warnmsg("cli_pdf: aes_128cbc_encrypt: in_length is <16: %zu\n", in_length);
        return;
    }

    cli_dbgmsg("aes_128cbc_encrypt: Calling rijndaelSetupEncrypt\n");
    nrounds = rijndaelSetupEncrypt(rk, key, key_n * 8);
    if (!nrounds) {
        cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: nrounds = 0\n");
        return;
    }

    cli_dbgmsg("aes_128cbc_encrypt: Beginning rijndaelEncrypt\n");

    if (iv)
        memcpy(real_iv, iv, sizeof(real_iv));

    /* CBC: ciphertext = E(plaintext XOR previous ciphertext); real_iv carries the chain. */
    while (in_length >= 16) {
        for (i = 0; i < 16; i++)
            real_iv[i] ^= in[i];

        rijndaelEncrypt(rk, nrounds, real_iv, real_iv);

        memcpy(out, real_iv, 16);

        out += 16;
        *out_length += 16;
        in += 16;
        in_length -= 16;
    }

    cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: length is %zu\n", *out_length);
}
2017-08-16 17:31:45 -04:00
char * decrypt_any ( struct pdf_struct * pdf , uint32_t id , const char * in , size_t * length , enum enc_method enc_method )
2011-12-14 15:43:14 +02:00
{
unsigned char * key , * q , result [ 16 ] ;
unsigned n ;
struct arc4_state arc4 ;
2011-12-28 19:05:57 +02:00
if ( ! length | | ! * length | | ! in ) {
2020-05-12 17:25:00 -07:00
noisy_warnmsg ( " decrypt_any: decrypt failed for obj %u %u: Invalid arguments. \n " , id > > 8 , id & 0xff ) ;
return NULL ;
}
if ( NULL = = pdf - > key | | 0 = = pdf - > keylen ) {
noisy_warnmsg ( " decrypt_any: decrypt failed for obj %u %u: PDF key never identified. \n " , id > > 8 , id & 0xff ) ;
2014-04-07 16:39:54 -04:00
return NULL ;
2011-12-28 19:05:57 +02:00
}
2014-04-07 16:39:54 -04:00
2011-12-14 15:43:14 +02:00
n = pdf - > keylen + 5 ;
2011-12-15 17:24:36 +02:00
if ( enc_method = = ENC_AESV2 )
2014-04-07 16:39:54 -04:00
n + = 4 ;
2022-05-09 14:28:34 -07:00
key = cli_max_malloc ( n ) ;
2011-12-28 19:05:57 +02:00
if ( ! key ) {
2014-04-07 16:39:54 -04:00
noisy_warnmsg ( " decrypt_any: malloc failed \n " ) ;
return NULL ;
2011-12-28 19:05:57 +02:00
}
2011-12-14 15:43:14 +02:00
memcpy ( key , pdf - > key , pdf - > keylen ) ;
2018-12-03 12:40:13 -05:00
q = key + pdf - > keylen ;
2011-12-14 15:43:14 +02:00
* q + + = id > > 8 ;
* q + + = id > > 16 ;
* q + + = id > > 24 ;
* q + + = id ;
* q + + = 0 ;
2011-12-15 17:24:36 +02:00
if ( enc_method = = ENC_AESV2 )
2014-04-07 16:39:54 -04:00
memcpy ( q , " sAlT " , 4 ) ;
2014-02-08 00:31:12 -05:00
cl_hash_data ( " md5 " , key , n , result , NULL ) ;
2011-12-15 13:27:31 +02:00
free ( key ) ;
2011-12-14 15:43:14 +02:00
n = pdf - > keylen + 5 ;
if ( n > 16 )
2014-04-07 16:39:54 -04:00
n = 16 ;
2011-12-14 15:43:14 +02:00
2022-05-09 14:28:34 -07:00
q = cli_max_calloc ( * length , sizeof ( char ) ) ;
2011-12-28 19:05:57 +02:00
if ( ! q ) {
2014-04-07 16:39:54 -04:00
noisy_warnmsg ( " decrypt_any: malloc failed \n " ) ;
return NULL ;
2011-12-28 19:05:57 +02:00
}
2011-12-14 15:43:14 +02:00
2011-12-15 17:24:36 +02:00
switch ( enc_method ) {
2018-12-03 12:40:13 -05:00
case ENC_V2 :
cli_dbgmsg ( " cli_pdf: enc is v2 \n " ) ;
memcpy ( q , in , * length ) ;
2020-05-12 17:25:00 -07:00
if ( false = = arc4_init ( & arc4 , result , n ) ) {
noisy_warnmsg ( " decrypt_any: failed to init arc4 \n " ) ;
2020-07-15 08:39:32 -07:00
free ( q ) ;
2020-05-12 17:25:00 -07:00
return NULL ;
}
2018-12-03 12:40:13 -05:00
arc4_apply ( & arc4 , q , ( unsigned ) * length ) ; /* TODO: may truncate for very large lengths */
2014-04-07 16:39:54 -04:00
2018-12-03 12:40:13 -05:00
noisy_msg ( pdf , " decrypt_any: decrypted ARC4 data \n " ) ;
2014-04-07 16:39:54 -04:00
2018-12-03 12:40:13 -05:00
break ;
case ENC_AESV2 :
cli_dbgmsg ( " cli_pdf: enc is aesv2 \n " ) ;
2019-08-23 12:57:18 -04:00
aes_256cbc_decrypt ( ( const unsigned char * ) in , length , q , ( char * ) result , n , 1 ) ;
2014-04-07 16:39:54 -04:00
2018-12-03 12:40:13 -05:00
noisy_msg ( pdf , " decrypt_any: decrypted AES(v2) data \n " ) ;
2014-04-07 16:39:54 -04:00
2018-12-03 12:40:13 -05:00
break ;
case ENC_AESV3 :
cli_dbgmsg ( " decrypt_any: enc is aesv3 \n " ) ;
2014-04-07 16:39:54 -04:00
2019-08-23 12:57:18 -04:00
aes_256cbc_decrypt ( ( const unsigned char * ) in , length , q , pdf - > key , pdf - > keylen , 1 ) ;
2014-04-07 16:39:54 -04:00
2018-12-03 12:40:13 -05:00
noisy_msg ( pdf , " decrypted AES(v3) data \n " ) ;
2014-04-07 16:39:54 -04:00
2018-12-03 12:40:13 -05:00
break ;
case ENC_IDENTITY :
cli_dbgmsg ( " decrypt_any: enc is identity \n " ) ;
memcpy ( q , in , * length ) ;
2014-04-07 16:39:54 -04:00
2018-12-03 12:40:13 -05:00
noisy_msg ( pdf , " decrypt_any: identity encryption \n " ) ;
2014-04-07 16:39:54 -04:00
2018-12-03 12:40:13 -05:00
break ;
case ENC_NONE :
cli_dbgmsg ( " decrypt_any: enc is none \n " ) ;
noisy_msg ( pdf , " encryption is none \n " ) ;
free ( q ) ;
return NULL ;
case ENC_UNKNOWN :
cli_dbgmsg ( " decrypt_any: enc is unknown \n " ) ;
free ( q ) ;
noisy_warnmsg ( " decrypt_any: unknown encryption method for obj %u %u \n " ,
id > > 8 , id & 0xff ) ;
return NULL ;
2011-12-14 15:43:14 +02:00
}
2014-04-07 16:39:54 -04:00
2014-07-10 18:11:49 -04:00
return ( char * ) q ;
2011-12-14 15:43:14 +02:00
}
2015-03-20 15:10:52 -04:00
enum enc_method get_enc_method ( struct pdf_struct * pdf , struct pdf_obj * obj )
2011-12-15 17:24:36 +02:00
{
if ( obj - > flags & ( 1 < < OBJ_EMBEDDED_FILE ) )
2014-04-07 16:39:54 -04:00
return pdf - > enc_method_embeddedfile ;
2011-12-15 17:24:36 +02:00
if ( obj - > flags & ( 1 < < OBJ_STREAM ) )
2014-04-07 16:39:54 -04:00
return pdf - > enc_method_stream ;
2011-12-15 17:24:36 +02:00
return pdf - > enc_method_string ;
}
2011-12-23 17:40:22 +02:00
/*
 * Scanner state used by process() while walking content-stream bytes.
 * (The "TJ" naming presumably refers to the PDF text-showing operator - TODO confirm.)
 */
enum cstate {
/* Initial state: waiting for '['; any other byte skips ahead to the next newline. */
CSTATE_NONE ,
/* A '[' has been seen: waiting for '('. */
CSTATE_TJ ,
/* Between '(' and ')': bytes are fed to the text normalizer. */
CSTATE_TJ_PAROPEN
} ;
2019-05-04 15:54:54 -04:00
static void process ( struct text_norm_state * s , enum cstate * st , const char * buf , size_t length , int fout )
2011-12-23 17:40:22 +02:00
{
do {
2014-04-07 16:39:54 -04:00
switch ( * st ) {
2018-12-03 12:40:13 -05:00
case CSTATE_NONE :
if ( * buf = = ' [ ' ) {
* st = CSTATE_TJ ;
} else {
const char * nl = memchr ( buf , ' \n ' , length ) ;
if ( ! nl )
return ;
2014-04-07 16:39:54 -04:00
2019-05-04 15:54:54 -04:00
if ( ( size_t ) ( nl - buf ) > length ) {
length = 0 ;
} else {
length - = nl - buf ;
}
2018-12-03 12:40:13 -05:00
buf = nl ;
}
2014-04-07 16:39:54 -04:00
2018-12-03 12:40:13 -05:00
break ;
case CSTATE_TJ :
if ( * buf = = ' ( ' )
* st = CSTATE_TJ_PAROPEN ;
2014-04-07 16:39:54 -04:00
2018-12-03 12:40:13 -05:00
break ;
case CSTATE_TJ_PAROPEN :
if ( * buf = = ' ) ' ) {
* st = CSTATE_TJ ;
} else {
if ( text_normalize_buffer ( s , ( const unsigned char * ) buf , 1 ) ! = 1 ) {
cli_writen ( fout , s - > out , s - > out_pos ) ;
text_normalize_reset ( s ) ;
}
2014-04-07 16:39:54 -04:00
}
2018-12-03 12:40:13 -05:00
break ;
2014-04-07 16:39:54 -04:00
}
buf + + ;
2019-05-04 15:54:54 -04:00
if ( length > 0 )
length - - ;
2011-12-23 17:40:22 +02:00
} while ( length > 0 ) ;
}
2024-01-15 23:03:02 -05:00
static int pdf_scan_contents ( int fd , struct pdf_struct * pdf , struct pdf_obj * obj )
2011-12-23 17:40:22 +02:00
{
struct text_norm_state s ;
char fullname [ 1024 ] ;
char outbuff [ BUFSIZ ] ;
char inbuf [ BUFSIZ ] ;
2019-05-04 15:54:54 -04:00
int fout ;
size_t n ;
2018-08-14 14:00:31 -07:00
cl_error_t rc ;
2011-12-23 17:40:22 +02:00
enum cstate st = CSTATE_NONE ;
2024-01-15 23:03:02 -05:00
snprintf ( fullname , sizeof ( fullname ) , " %s " PATHSEP " pdf obj %d %d contents " , pdf - > dir , obj - > id > > 8 , obj - > id & 0xff ) ;
2018-12-03 12:40:13 -05:00
fout = open ( fullname , O_RDWR | O_CREAT | O_EXCL | O_TRUNC | O_BINARY , 0600 ) ;
2011-12-23 17:40:22 +02:00
if ( fout < 0 ) {
2014-04-07 16:39:54 -04:00
char err [ 128 ] ;
2018-08-14 14:00:31 -07:00
cli_errmsg ( " pdf_scan_contents: can't create temporary file %s: %s \n " , fullname , cli_strerror ( errno , err , sizeof ( err ) ) ) ;
2014-04-07 16:39:54 -04:00
return CL_ETMPFILE ;
2011-12-23 17:40:22 +02:00
}
2014-07-10 18:11:49 -04:00
text_normalize_init ( & s , ( unsigned char * ) outbuff , sizeof ( outbuff ) ) ;
2011-12-23 17:40:22 +02:00
while ( 1 ) {
2014-04-07 16:39:54 -04:00
n = cli_readn ( fd , inbuf , sizeof ( inbuf ) ) ;
2019-05-04 15:54:54 -04:00
if ( ( n = = 0 ) | | ( n = = ( size_t ) - 1 ) )
2014-04-07 16:39:54 -04:00
break ;
process ( & s , & st , inbuf , n , fout ) ;
2011-12-23 17:40:22 +02:00
}
2014-04-07 16:39:54 -04:00
2011-12-23 17:40:22 +02:00
cli_writen ( fout , s . out , s . out_pos ) ;
2011-12-28 19:05:57 +02:00
lseek ( fout , 0 , SEEK_SET ) ;
2022-03-09 22:26:40 -08:00
rc = cli_magic_scan_desc ( fout , fullname , pdf - > ctx , NULL , LAYER_ATTRIBUTES_NONE ) ;
2011-12-23 17:40:22 +02:00
close ( fout ) ;
2014-04-07 16:39:54 -04:00
Improve tmp sub-directory names
At present many parsers create tmp subdirectories to store extracted
files. For parsers like the vba parser, this is required as the
directory is later scanned. For other parsers, these subdirectories are
probably not helpful now that we provide recursive sub-dirs when
--leave-temps is enabled. It's not quite as simple as removing the extra
subdirectories, however. Certain parsers, like autoit, don't create very
unique filenames and would result in file name collisions when
--leave-temps is not enabled.
The best thing to do would be to make sure each parser uses unique
filenames and doesn't rely on cli_magic_scan_dir() to scan extracted
content before removing the extra subdirectory. In the meantime, this
commit gives the extra subdirectories meaningful names to improve
readability.
This commit also:
- Provides the 'bmp' prefix for extracted PE icons.
- Removes empty tmp subdirs when extracting rtf files, to eliminate
clutter.
- The PDF parser sometimes creates tmp files when decompressing streams
before it knows if there is actually any content to decompress. This
resulted in a large number of empty files. While it would be best to
avoid creating empty files in the first place, that's not quite as
as it sounds. This commit does the next best thing and deletes the
tmp files if nothing was actually extracted, even if --leave-temps is
enabled.
- Removes the "scantemp" prefix for unnamed fmaps scanned with
cli_magic_scan(). The 5-character hashes given to tmp files with
prefixes resulted in occasional file name collisions when extracting
certain file types with thousands of embedded files.
- The VBA and TAR parsers mistakenly used NAME_MAX instead of PATH_MAX,
resulting in truncated file paths and failed extraction when
--leave-temps is enabled and a lot of recursion is in play. This commit
switches them from NAME_MAX to PATH_MAX.
2020-03-27 16:06:22 -04:00
if ( ! pdf - > ctx - > engine - > keeptmp | | ( s . out_pos = = 0 ) )
2014-04-07 16:39:54 -04:00
if ( cli_unlink ( fullname ) & & rc ! = CL_VIRUS )
rc = CL_EUNLINK ;
2011-12-28 19:05:57 +02:00
return rc ;
2011-12-23 17:40:22 +02:00
}
2020-03-19 21:23:54 -04:00
cl_error_t pdf_extract_obj ( struct pdf_struct * pdf , struct pdf_obj * obj , uint32_t flags )
2010-05-11 10:37:10 +03:00
{
2025-04-07 16:50:09 -07:00
cl_error_t status = CL_SUCCESS ;
cl_error_t ret ;
Improve tmp sub-directory names
At present many parsers create tmp subdirectories to store extracted
files. For parsers like the vba parser, this is required as the
directory is later scanned. For other parsers, these subdirectories are
probably not helpful now that we provide recursive sub-dirs when
--leave-temps is enabled. It's not quite as simple as removing the extra
subdirectories, however. Certain parsers, like autoit, don't create very
unique filenames and would result in file name collisions when
--leave-temps is not enabled.
The best thing to do would be to make sure each parser uses unique
filenames and doesn't rely on cli_magic_scan_dir() to scan extracted
content before removing the extra subdirectory. In the meantime, this
commit gives the extra subdirectories meaningful names to improve
readability.
This commit also:
- Provides the 'bmp' prefix for extracted PE icons.
- Removes empty tmp subdirs when extracting rtf files, to eliminate
clutter.
- The PDF parser sometimes creates tmp files when decompressing streams
before it knows if there is actually any content to decompress. This
resulted in a large number of empty files. While it would be best to
avoid creating empty files in the first place, that's not quite as easy
as it sounds. This commit does the next best thing and deletes the
tmp files if nothing was actually extracted, even if --leave-temps is
enabled.
- Removes the "scantemp" prefix for unnamed fmaps scanned with
cli_magic_scan(). The 5-character hashes given to tmp files with
prefixes resulted in occasional file name collisions when extracting
certain file types with thousands of embedded files.
- The VBA and TAR parsers mistakenly used NAME_MAX instead of PATH_MAX,
resulting in truncated file paths and failed extraction when
--leave-temps is enabled and a lot of recursion is in play. This commit
switches them from NAME_MAX to PATH_MAX.
2020-03-27 16:06:22 -04:00
char fullname [ PATH_MAX + 1 ] ;
2025-04-07 16:50:09 -07:00
bool extracted_an_object = false ;
int fout = - 1 ;
size_t sum = 0 ;
bool dump = true ;
struct pdf_dict * dparams = NULL ;
2010-05-11 11:26:35 +03:00
2018-12-03 12:40:13 -05:00
cli_dbgmsg ( " pdf_extract_obj: obj %u %u \n " , obj - > id > > 8 , obj - > id & 0xff ) ;
2013-03-21 11:53:28 -04:00
2022-07-01 17:32:00 -07:00
if ( PDF_OBJECT_RECURSION_LIMIT < pdf - > parse_recursion_depth ) {
cli_dbgmsg ( " pdf_extract_obj: Recursion limit reached. \n " ) ;
2025-04-07 16:50:09 -07:00
status = CL_SUCCESS ;
goto done ;
2022-07-01 17:32:00 -07:00
}
2023-04-19 16:10:16 -07:00
if ( obj - > extracted ) {
// Should not attempt to extract the same object more than once.
2025-04-07 16:50:09 -07:00
status = CL_SUCCESS ;
goto done ;
2023-04-19 16:10:16 -07:00
}
// We're not done yet, but this is enough to say we've tried.
// Trying again won't help any.
obj - > extracted = true ;
2018-10-01 19:46:23 -04:00
if ( obj - > objstm ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " pdf_extract_obj: extracting obj found in objstm. \n " ) ;
2018-10-01 19:46:23 -04:00
if ( obj - > objstm - > streambuf = = NULL ) {
cli_warnmsg ( " pdf_extract_obj: object in object stream has null stream buffer! \n " ) ;
2025-04-07 16:50:09 -07:00
status = CL_EFORMAT ;
goto done ;
2018-10-01 19:46:23 -04:00
}
}
2018-08-14 14:00:31 -07:00
2025-04-07 16:50:09 -07:00
/* Check to see if this is a URI referenced from a prior URI object */
if ( obj - > flags & ( 1 < < OBJ_URI ) ) {
URI_cb ( pdf , obj , NULL ) ;
status = CL_SUCCESS ;
goto done ;
}
2010-08-01 22:14:44 +03:00
/* TODO: call bytecode hook here, allow override dumpability */
2014-04-07 16:39:54 -04:00
if ( ( ! ( obj - > flags & ( 1 < < OBJ_STREAM ) ) | | ( obj - > flags & ( 1 < < OBJ_HASFILTERS ) ) ) & & ! ( obj - > flags & DUMP_MASK ) ) {
/* don't dump all streams */
2025-04-07 16:50:09 -07:00
dump = false ;
2010-05-11 11:26:35 +03:00
}
2014-04-07 16:39:54 -04:00
if ( ( obj - > flags & ( 1 < < OBJ_IMAGE ) ) & & ! ( obj - > flags & ( 1 < < OBJ_FILTER_DCT ) ) ) {
/* don't dump / scan non-JPG images */
2025-04-07 16:50:09 -07:00
dump = false ;
2010-08-01 22:14:44 +03:00
}
2014-04-07 16:39:54 -04:00
2010-08-02 15:42:58 +03:00
if ( obj - > flags & ( 1 < < OBJ_FORCEDUMP ) ) {
2014-04-07 16:39:54 -04:00
/* bytecode can force dump by setting this flag */
2025-04-07 16:50:09 -07:00
dump = true ;
2010-08-02 15:42:58 +03:00
}
2014-04-07 16:39:54 -04:00
2025-04-07 16:50:09 -07:00
if ( ! dump ) {
status = CL_SUCCESS ;
goto done ;
}
2014-04-07 16:39:54 -04:00
2018-12-03 12:40:13 -05:00
cli_dbgmsg ( " pdf_extract_obj: dumping obj %u %u \n " , obj - > id > > 8 , obj - > id & 0xff ) ;
2014-04-07 16:39:54 -04:00
2024-01-15 23:03:02 -05:00
snprintf ( fullname , sizeof ( fullname ) , " %s " PATHSEP " pdf obj %d %d " , pdf - > dir , obj - > id > > 8 , obj - > id & 0xff ) ;
2018-12-03 12:40:13 -05:00
fout = open ( fullname , O_RDWR | O_CREAT | O_EXCL | O_TRUNC | O_BINARY , 0600 ) ;
2010-05-11 11:26:35 +03:00
if ( fout < 0 ) {
2014-04-07 16:39:54 -04:00
char err [ 128 ] ;
2018-08-14 14:00:31 -07:00
cli_errmsg ( " pdf_extract_obj: can't create temporary file %s: %s \n " , fullname , cli_strerror ( errno , err , sizeof ( err ) ) ) ;
2025-04-07 16:50:09 -07:00
status = CL_ETMPFILE ;
goto done ;
2010-05-11 11:26:35 +03:00
}
2025-04-07 16:50:09 -07:00
extracted_an_object = true ;
2023-04-19 16:10:16 -07:00
if ( ! ( flags & PDF_EXTRACT_OBJ_SCAN ) ) {
2025-04-07 16:50:09 -07:00
/*
* When PDF_EXTRACT_OBJ_SCAN is not set , this function is used to extract the object to a temp file
* and so we need to save off the path in obj - > path for the caller to use .
*/
2023-04-19 16:10:16 -07:00
if ( NULL ! = obj - > path ) {
obj - > path = strdup ( fullname ) ;
}
}
2014-06-10 13:21:31 -04:00
2019-03-05 21:15:41 -05:00
if ( ( NULL = = obj - > objstm ) & &
( obj - > flags & ( 1 < < OBJ_STREAM ) ) ) {
/*
* Object contains a stream . Parse this now .
*/
cli_dbgmsg ( " pdf_extract_obj: parsing a stream in obj %u %u \n " , obj - > id > > 8 , obj - > id & 0xff ) ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
const char * start = pdf - > map + obj - > start ;
2018-08-14 14:00:31 -07:00
2019-03-05 21:15:41 -05:00
size_t length ;
size_t orig_length ;
int dict_len = obj - > stream - start ; /* Dictionary should end where the stream begins */
2012-01-18 20:58:38 +02:00
2019-03-05 21:15:41 -05:00
const char * pstr ;
struct objstm_struct * objstm = NULL ;
int xref = 0 ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
/* Find and interpret the length dictionary value */
length = find_length ( pdf , obj , start , dict_len ) ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
orig_length = length ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
if ( length > obj - > stream_size ) {
cli_dbgmsg ( " cli_pdf: Stream length exceeds object length by %zu bytes. Length truncated to %zu bytes \n " , length - obj - > stream_size , obj - > stream_size ) ;
noisy_warnmsg ( " Stream length exceeds object length by %zu bytes. Length truncated to %zu bytes \n " , length - obj - > stream_size , obj - > stream_size ) ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
length = obj - > stream_size ;
}
2014-04-07 16:39:54 -04:00
2021-04-06 10:05:10 -07:00
if ( ! ( obj - > flags & ( 1 < < OBJ_FILTER_FLATE ) ) & & ( length = = 0 ) ) {
2019-03-05 21:15:41 -05:00
/*
* If the length is unknown and this doesn ' t contain a FLATE encoded filter . . .
* Calculate the length using the stream size , and trimming
* off any newline / carriage returns from the end of the stream .
*/
const char * q = start + obj - > stream_size ;
length = obj - > stream_size ;
q - - ;
2021-04-06 10:05:10 -07:00
if ( length > 0 ) {
if ( * q = = ' \n ' ) {
q - - ;
length - - ;
2019-03-05 21:15:41 -05:00
2021-04-06 10:05:10 -07:00
if ( length > 0 & & * q = = ' \r ' )
length - - ;
} else if ( * q = = ' \r ' ) {
2019-03-05 21:15:41 -05:00
length - - ;
2021-04-06 10:05:10 -07:00
}
2019-03-05 21:15:41 -05:00
}
2014-04-07 16:39:54 -04:00
2025-06-30 10:47:02 -04:00
cli_dbgmsg ( " pdf_extract_obj: calculated length %zu \n " , length ) ;
2019-03-05 21:15:41 -05:00
} else {
if ( obj - > stream_size > ( size_t ) length + 2 ) {
cli_dbgmsg ( " cli_pdf: calculated length %zu < %zu \n " ,
2025-06-30 10:47:02 -04:00
length , obj - > stream_size ) ;
2019-03-05 21:15:41 -05:00
length = obj - > stream_size ;
}
}
2014-04-07 16:39:54 -04:00
2025-06-30 10:47:02 -04:00
if ( ( 0 ! = orig_length ) & & ( obj - > stream_size > orig_length + 20 ) ) {
cli_dbgmsg ( " pdf_extract_obj: orig length: %zu, length: %zu, size: %zu \n " ,
orig_length , length , obj - > stream_size ) ;
2019-03-05 21:15:41 -05:00
pdfobj_flag ( pdf , obj , BAD_STREAMLEN ) ;
}
2016-04-18 17:11:12 -04:00
2019-03-05 21:15:41 -05:00
if ( 0 = = length ) {
length = obj - > stream_size ;
if ( 0 = = length ) {
cli_dbgmsg ( " pdf_extract_obj: Alleged or calculated stream length and stream buffer size both 0 \n " ) ;
2025-04-07 16:50:09 -07:00
/* Empty stream, nothing to scan */
status = CL_SUCCESS ;
goto done ;
2019-03-05 21:15:41 -05:00
}
}
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
/* Check if XRef is enabled */
if ( cli_memstr ( start , dict_len , " /XRef " , strlen ( " /XRef " ) ) ) {
xref = 1 ;
}
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
/*
* Identify the DecodeParms , if available .
*/
if ( NULL ! = ( pstr = pdf_getdict ( start , & dict_len , " /DecodeParms " ) ) ) {
cli_dbgmsg ( " pdf_extract_obj: Found /DecodeParms \n " ) ;
} else if ( NULL ! = ( pstr = pdf_getdict ( start , & dict_len , " /DP " ) ) ) {
cli_dbgmsg ( " pdf_extract_obj: Found /DP \n " ) ;
}
2016-04-13 18:29:55 -04:00
2019-03-05 21:15:41 -05:00
if ( pstr ) {
/* shift pstr left to "<<" for pdf_parse_dict */
while ( ( * pstr = = ' < ' ) & & ( pstr > start ) ) {
pstr - - ;
dict_len + + ;
}
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
/* shift pstr right to "<<" for pdf_parse_dict */
while ( ( * pstr ! = ' < ' ) & & ( dict_len > 0 ) ) {
pstr + + ;
dict_len - - ;
}
2022-07-01 17:32:00 -07:00
if ( dict_len > 4 ) {
pdf - > parse_recursion_depth + + ;
2019-03-05 21:15:41 -05:00
dparams = pdf_parse_dict ( pdf , obj , obj - > size , ( char * ) pstr , NULL ) ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth - - ;
} else {
2019-03-05 21:15:41 -05:00
cli_dbgmsg ( " pdf_extract_obj: failed to locate DecodeParms dictionary start \n " ) ;
2022-07-01 17:32:00 -07:00
}
2019-03-05 21:15:41 -05:00
}
/*
* Go back to the start of the dictionary and check to see if the stream
* is an object stream . If so , collect the relevant info .
*/
dict_len = obj - > stream - start ;
if ( NULL ! = ( pstr = pdf_getdict ( start , & dict_len , " /Type/ObjStm " ) ) ) {
2025-06-30 10:47:02 -04:00
int objstm_first = - 1 ;
int objstm_length = - 1 ;
int objstm_n = - 1 ;
2019-03-05 21:15:41 -05:00
cli_dbgmsg ( " pdf_extract_obj: Found /Type/ObjStm \n " ) ;
dict_len = obj - > stream - start ;
2025-06-30 10:47:02 -04:00
if ( - 1 = = ( objstm_first = pdf_readint ( start , dict_len , " /First " ) ) ) {
2019-03-05 21:15:41 -05:00
cli_warnmsg ( " pdf_extract_obj: Failed to find offset of first object in object stream \n " ) ;
2025-06-30 10:47:02 -04:00
} else if ( - 1 = = ( objstm_length = pdf_readint ( start , dict_len , " /Length " ) ) ) {
2019-03-05 21:15:41 -05:00
cli_warnmsg ( " pdf_extract_obj: Failed to find length of object stream \n " ) ;
2025-06-30 10:47:02 -04:00
} else if ( - 1 = = ( objstm_n = pdf_readint ( start , dict_len , " /N " ) ) ) {
2019-03-05 21:15:41 -05:00
cli_warnmsg ( " pdf_extract_obj: Failed to find num objects in object stream \n " ) ;
} else {
/* Add objstm to pdf struct, so it can be freed eventually */
pdf - > nobjstms + + ;
2024-01-09 17:44:33 -05:00
pdf - > objstms = cli_max_realloc_or_free ( pdf - > objstms , sizeof ( struct objstm_struct * ) * pdf - > nobjstms ) ;
2019-03-05 21:15:41 -05:00
if ( ! pdf - > objstms ) {
cli_warnmsg ( " pdf_extract_obj: out of memory parsing object stream (%u) \n " , pdf - > nobjstms ) ;
2025-04-07 16:50:09 -07:00
status = CL_EMEM ;
goto done ;
2018-08-14 14:00:31 -07:00
}
2025-08-27 14:22:35 -04:00
CLI_CALLOC_OR_GOTO_DONE (
objstm , 1 , sizeof ( struct objstm_struct ) ,
cli_warnmsg ( " pdf_extract_obj: out of memory parsing object stream (%u) \n " , pdf - > nobjstms ) ,
status = CL_EMEM ) ;
2018-08-14 14:00:31 -07:00
2025-08-27 14:22:35 -04:00
pdf - > objstms [ pdf - > nobjstms - 1 ] = objstm ;
2018-08-14 14:00:31 -07:00
2025-06-30 10:47:02 -04:00
objstm - > first = ( size_t ) objstm_first ;
objstm - > current = ( size_t ) objstm_first ;
2019-03-05 21:15:41 -05:00
objstm - > current_pair = 0 ;
2025-06-30 10:47:02 -04:00
objstm - > length = ( size_t ) objstm_length ;
objstm - > n = ( size_t ) objstm_n ;
2018-08-14 14:00:31 -07:00
2025-06-30 10:47:02 -04:00
cli_dbgmsg ( " pdf_extract_obj: ObjStm first obj at offset %zu \n " , objstm - > first ) ;
cli_dbgmsg ( " pdf_extract_obj: ObjStm length is %zu bytes \n " , objstm - > length ) ;
cli_dbgmsg ( " pdf_extract_obj: ObjStm should contain %zu objects \n " , objstm - > n ) ;
2019-03-05 21:15:41 -05:00
}
}
2025-04-07 16:50:09 -07:00
sum = pdf_decodestream ( pdf , obj , dparams , obj - > stream , ( uint32_t ) length , xref , fout , & status , objstm ) ;
if ( ( CL_SUCCESS ! = status ) & & ( CL_VIRUS ! = status ) ) {
cli_dbgmsg ( " Error decoding stream! Error code: %d \n " , status ) ;
2019-03-05 21:15:41 -05:00
/* It's ok if we couldn't decode the stream,
2020-01-31 11:52:00 -08:00
* make a best effort to keep parsing . . .
* Unless we were unable to allocate memory . */
2025-04-07 16:50:09 -07:00
if ( CL_EMEM = = status ) {
goto done ;
2020-01-31 11:52:00 -08:00
}
2025-04-07 16:50:09 -07:00
if ( CL_EPARSE = = status ) {
status = CL_SUCCESS ;
2020-01-31 11:52:00 -08:00
}
2018-08-14 14:00:31 -07:00
2019-03-05 21:15:41 -05:00
if ( NULL ! = objstm ) {
/*
* If we were expecting an objstm and there was a failure . . .
* discard the memory for last object stream .
*/
if ( NULL ! = pdf - > objstms ) {
if ( NULL ! = pdf - > objstms [ pdf - > nobjstms - 1 ] ) {
if ( NULL ! = pdf - > objstms [ pdf - > nobjstms - 1 ] - > streambuf ) {
free ( pdf - > objstms [ pdf - > nobjstms - 1 ] - > streambuf ) ;
pdf - > objstms [ pdf - > nobjstms - 1 ] - > streambuf = NULL ;
}
free ( pdf - > objstms [ pdf - > nobjstms - 1 ] ) ;
pdf - > objstms [ pdf - > nobjstms - 1 ] = NULL ;
2018-08-14 14:00:31 -07:00
}
2019-03-05 21:15:41 -05:00
/* Pop the objstm off the end of the pdf->objstms array. */
if ( pdf - > nobjstms > 0 ) {
pdf - > nobjstms - - ;
if ( 0 = = pdf - > nobjstms ) {
free ( pdf - > objstms ) ;
pdf - > objstms = NULL ;
} else {
2024-01-09 17:44:33 -05:00
pdf - > objstms = cli_max_realloc_or_free ( pdf - > objstms , sizeof ( struct objstm_struct * ) * pdf - > nobjstms ) ;
2018-08-14 14:00:31 -07:00
2019-03-05 21:15:41 -05:00
if ( ! pdf - > objstms ) {
cli_warnmsg ( " pdf_extract_obj: out of memory when shrinking down objstm array \n " ) ;
2025-04-07 16:50:09 -07:00
status = CL_EMEM ;
goto done ;
2018-08-14 14:00:31 -07:00
}
}
2019-03-05 21:15:41 -05:00
} else {
/* hm.. this shouldn't happen */
cli_warnmsg ( " pdf_extract_obj: Failure counting objstms. \n " ) ;
2018-08-14 14:00:31 -07:00
}
2014-04-07 16:39:54 -04:00
}
2019-03-05 21:15:41 -05:00
}
}
2014-04-07 16:39:54 -04:00
2025-04-07 16:50:09 -07:00
if ( dparams ) {
2019-03-05 21:15:41 -05:00
pdf_free_dict ( dparams ) ;
2025-04-07 16:50:09 -07:00
dparams = NULL ;
}
2014-04-07 16:39:54 -04:00
2025-04-07 16:50:09 -07:00
if ( status = = CL_VIRUS ) {
/* skip post-filter scan */
2019-03-05 21:15:41 -05:00
goto done ;
}
2012-01-18 20:58:38 +02:00
2019-03-05 21:15:41 -05:00
} else if ( obj - > flags & ( 1 < < OBJ_JAVASCRIPT ) ) {
const char * q2 ;
const char * q = ( obj - > objstm ) ? ( const char * ) ( obj - > start + obj - > objstm - > streambuf )
: ( const char * ) ( obj - > start + pdf - > map ) ;
2018-08-14 14:00:31 -07:00
2019-03-05 21:15:41 -05:00
/* TODO: get obj-endobj size */
off_t bytesleft = obj - > size ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
if ( bytesleft < 0 ) {
2025-04-07 16:50:09 -07:00
goto scan_extracted_objects ;
2019-03-05 21:15:41 -05:00
}
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
do {
char * js = NULL ;
size_t js_len = 0 ;
const char * q3 ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
q2 = cli_memstr ( q , bytesleft , " /JavaScript " , 11 ) ;
if ( ! q2 )
break ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
bytesleft - = q2 - q + 11 ;
q = q2 + 11 ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
js = pdf_readstring ( q , bytesleft , " /JS " , NULL , & q2 , ! ( pdf - > flags & ( 1 < < DECRYPTABLE_PDF ) ) ) ;
bytesleft - = q2 - q ;
q = q2 ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
if ( js ) {
char * decrypted = NULL ;
const char * out = js ;
js_len = strlen ( js ) ;
if ( pdf - > flags & ( 1 < < DECRYPTABLE_PDF ) ) {
cli_dbgmsg ( " pdf_extract_obj: encrypted string \n " ) ;
decrypted = decrypt_any ( pdf , obj - > id , js , & js_len , pdf - > enc_method_string ) ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
if ( decrypted ) {
noisy_msg ( pdf , " pdf_extract_obj: decrypted Javascript string from obj %u %u \n " , obj - > id > > 8 , obj - > id & 0xff ) ;
out = decrypted ;
2014-04-07 16:39:54 -04:00
}
2019-03-05 21:15:41 -05:00
}
2014-04-07 16:39:54 -04:00
2025-06-08 01:12:33 -04:00
if ( ( pdf - > ctx - > options - > general & CL_SCAN_GENERAL_COLLECT_METADATA ) & & pdf - > ctx - > this_layer_metadata_json ! = NULL ) {
2020-07-01 16:56:26 -07:00
struct json_object * pdfobj , * jbig2arr ;
2025-06-08 01:12:33 -04:00
if ( NULL = = ( pdfobj = cli_jsonobj ( pdf - > ctx - > this_layer_metadata_json , " PDFStats " ) ) ) {
2020-07-01 16:56:26 -07:00
cli_errmsg ( " pdf_extract_obj: failed to get PDFStats JSON object \n " ) ;
} else if ( NULL = = ( jbig2arr = cli_jsonarray ( pdfobj , " JavascriptObjects " ) ) ) {
cli_errmsg ( " pdf_extract_obj: failed to get JavascriptObjects JSON object \n " ) ;
} else {
cli_jsonint_array ( jbig2arr , obj - > id > > 8 ) ;
}
}
2024-03-25 13:01:46 -04:00
2020-07-01 16:56:26 -07:00
pdf - > stats . njs + + ;
2019-03-05 21:15:41 -05:00
if ( filter_writen ( pdf , obj , fout , out , js_len , ( size_t * ) & sum ) ! = js_len ) {
2025-04-07 16:50:09 -07:00
status = CL_EWRITE ;
2014-04-07 16:39:54 -04:00
free ( js ) ;
2019-03-05 21:15:41 -05:00
break ;
}
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
free ( decrypted ) ;
free ( js ) ;
cli_dbgmsg ( " pdf_extract_obj: bytesleft: %d \n " , ( int ) bytesleft ) ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
if ( bytesleft > 0 ) {
q2 = pdf_nextobject ( q , bytesleft ) ;
if ( ! q2 )
q2 = q + bytesleft - 1 ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
/* non-conforming PDFs that don't escape ) properly */
q3 = memchr ( q , ' ) ' , bytesleft ) ;
if ( q3 & & q3 < q2 )
q2 = q3 ;
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
while ( q2 > q & & q2 [ - 1 ] = = ' ' )
q2 - - ;
if ( q2 > q ) {
q - - ;
filter_writen ( pdf , obj , fout , q , q2 - q , ( size_t * ) & sum ) ;
q + + ;
2014-04-07 16:39:54 -04:00
}
}
2019-03-05 21:15:41 -05:00
}
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
} while ( bytesleft > 0 ) ;
} else {
off_t bytesleft = obj - > size ;
if ( bytesleft < 0 )
2025-04-07 16:50:09 -07:00
status = CL_EFORMAT ;
2019-03-05 21:15:41 -05:00
else {
if ( obj - > objstm ) {
2025-04-07 16:50:09 -07:00
if ( filter_writen ( pdf , obj , fout , obj - > objstm - > streambuf + obj - > start , bytesleft , ( size_t * ) & sum ) ! = ( size_t ) bytesleft ) {
status = CL_EWRITE ;
}
2019-03-05 21:15:41 -05:00
} else {
2025-04-07 16:50:09 -07:00
if ( filter_writen ( pdf , obj , fout , pdf - > map + obj - > start , bytesleft , ( size_t * ) & sum ) ! = ( size_t ) bytesleft ) {
status = CL_EWRITE ;
}
2018-10-25 13:06:15 -07:00
}
2014-04-07 16:39:54 -04:00
}
2019-03-05 21:15:41 -05:00
}
2025-04-07 16:50:09 -07:00
scan_extracted_objects :
2014-04-07 16:39:54 -04:00
2018-12-03 12:40:13 -05:00
cli_dbgmsg ( " pdf_extract_obj: extracted %td bytes %u %u obj \n " , sum , obj - > id > > 8 , obj - > id & 0xff ) ;
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " pdf_extract_obj: ... to %s \n " , fullname ) ;
2014-04-07 16:39:54 -04:00
2025-04-07 16:50:09 -07:00
if ( ( flags & PDF_EXTRACT_OBJ_SCAN ) & & ( sum > 0 ) ) {
/*
* Scan the extracted objects for potential threats .
* PDF_EXTRACT_OBJ_SCAN is used when the extracted object should be scanned and then deleted .
*/
2014-04-07 16:39:54 -04:00
/* TODO: invoke bytecode on this pdf obj with metainformation associated */
lseek ( fout , 0 , SEEK_SET ) ;
2025-04-07 16:50:09 -07:00
ret = cli_magic_scan_desc ( fout , fullname , pdf - > ctx , NULL , LAYER_ATTRIBUTES_NONE ) ;
if ( ret ! = CL_SUCCESS ) {
status = ret ;
goto done ;
2022-08-23 16:59:14 -07:00
}
2014-04-07 16:39:54 -04:00
2025-04-07 16:50:09 -07:00
if ( ( status = = CL_CLEAN ) | | ( status = = CL_VIRUS ) ) {
2025-06-08 01:12:33 -04:00
ret = run_pdf_hooks ( pdf , PDF_PHASE_POSTDUMP , fout , fullname ) ;
2025-04-07 16:50:09 -07:00
if ( ret = = CL_VIRUS ) {
status = ret ;
goto done ;
2022-08-23 16:59:14 -07:00
}
2014-04-07 16:39:54 -04:00
}
2025-04-07 16:50:09 -07:00
if ( ( ( status = = CL_CLEAN ) | | ( status = = CL_VIRUS ) ) & & ( obj - > flags & ( 1 < < OBJ_CONTENTS ) ) ) {
2014-04-07 16:39:54 -04:00
lseek ( fout , 0 , SEEK_SET ) ;
2022-08-23 16:59:14 -07:00
cli_dbgmsg ( " pdf_extract_obj: dumping contents from obj %u %u \n " , obj - > id > > 8 , obj - > id & 0xff ) ;
2014-04-07 16:39:54 -04:00
2025-04-07 16:50:09 -07:00
ret = pdf_scan_contents ( fout , pdf , obj ) ;
if ( ret ! = CL_SUCCESS ) {
status = ret ;
goto done ;
2022-08-23 16:59:14 -07:00
}
2014-04-07 16:39:54 -04:00
}
2010-08-02 15:42:58 +03:00
}
2014-04-07 16:39:54 -04:00
2025-04-07 16:50:09 -07:00
done :
2014-04-07 16:39:54 -04:00
2025-04-07 16:50:09 -07:00
if ( NULL ! = dparams ) {
pdf_free_dict ( dparams ) ;
2020-01-31 11:52:00 -08:00
}
2014-04-07 16:39:54 -04:00
2025-04-07 16:50:09 -07:00
if ( - 1 ! = fout ) {
close ( fout ) ;
}
if ( extracted_an_object & & ( flags & PDF_EXTRACT_OBJ_SCAN ) & & ! pdf - > ctx - > engine - > keeptmp ) {
/*
* When PDF_EXTRACT_OBJ_SCAN is set , the goal is to extract , scan , and delete it .
* If it was not set , we would keep it and the path is passed back obj - > path for the caller to use .
* That ' s why we wouldn ' t unlink it here .
*/
if ( cli_unlink ( fullname ) & & status ! = CL_VIRUS ) {
status = CL_EUNLINK ;
}
}
return status ;
2010-05-11 10:37:10 +03:00
}
2010-05-10 23:41:34 +03:00
/*
 * Context carried between consecutive PDF names while an object's dictionary
 * is being walked. handle_pdfname() switches between these values according
 * to the from_state/to_state columns of the pdfname_actions[] table.
 */
enum objstate {
    STATE_NONE,         /* no pending context */
    STATE_S,            /* entered by the "S" name */
    STATE_FILTER,       /* entered by "Filter"; filter names keep the parser in this state */
    STATE_JAVASCRIPT,   /* entered by the "JavaScript" name */
    STATE_OPENACTION,   /* entered by the "OpenAction" name */
    STATE_LINEARIZED,   /* entered by the "Linearized" name */
    STATE_LAUNCHACTION, /* entered by the "Launch" name */
    STATE_CONTENTS,     /* entered by the "Contents" name */
    STATE_URI,          /* entered by the "URI" name */
    STATE_ANY           /* for actions table below */
};
2018-12-03 12:40:13 -05:00
# define NAMEFLAG_NONE 0x0
# define NAMEFLAG_HEURISTIC 0x1
2014-07-30 14:20:45 -04:00
2010-05-10 23:41:34 +03:00
struct pdfname_action {
const char * pdfname ;
2018-12-03 12:40:13 -05:00
enum pdf_objflags set_objflag ; /* OBJ_DICT is noop */
enum objstate from_state ; /* STATE_NONE is noop */
2010-05-10 23:41:34 +03:00
enum objstate to_state ;
2014-07-30 14:20:45 -04:00
uint32_t nameflags ;
2014-06-25 14:06:17 -04:00
void ( * pdf_stats_cb ) ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act ) ;
2010-05-10 23:41:34 +03:00
} ;
static struct pdfname_action pdfname_actions [ ] = {
2014-07-30 14:20:45 -04:00
{ " ASCIIHexDecode " , OBJ_FILTER_AH , STATE_FILTER , STATE_FILTER , NAMEFLAG_HEURISTIC , ASCIIHexDecode_cb } ,
{ " ASCII85Decode " , OBJ_FILTER_A85 , STATE_FILTER , STATE_FILTER , NAMEFLAG_HEURISTIC , ASCII85Decode_cb } ,
{ " A85 " , OBJ_FILTER_A85 , STATE_FILTER , STATE_FILTER , NAMEFLAG_HEURISTIC , ASCII85Decode_cb } ,
{ " AHx " , OBJ_FILTER_AH , STATE_FILTER , STATE_FILTER , NAMEFLAG_HEURISTIC , ASCIIHexDecode_cb } ,
{ " EmbeddedFile " , OBJ_EMBEDDED_FILE , STATE_NONE , STATE_NONE , NAMEFLAG_HEURISTIC , EmbeddedFile_cb } ,
{ " FlateDecode " , OBJ_FILTER_FLATE , STATE_FILTER , STATE_FILTER , NAMEFLAG_HEURISTIC , FlateDecode_cb } ,
{ " Fl " , OBJ_FILTER_FLATE , STATE_FILTER , STATE_FILTER , NAMEFLAG_HEURISTIC , FlateDecode_cb } ,
{ " Image " , OBJ_IMAGE , STATE_NONE , STATE_NONE , NAMEFLAG_HEURISTIC , Image_cb } ,
{ " LZWDecode " , OBJ_FILTER_LZW , STATE_FILTER , STATE_FILTER , NAMEFLAG_HEURISTIC , LZWDecode_cb } ,
{ " LZW " , OBJ_FILTER_LZW , STATE_FILTER , STATE_FILTER , NAMEFLAG_HEURISTIC , LZWDecode_cb } ,
{ " RunLengthDecode " , OBJ_FILTER_RL , STATE_FILTER , STATE_FILTER , NAMEFLAG_HEURISTIC , RunLengthDecode_cb } ,
{ " RL " , OBJ_FILTER_RL , STATE_FILTER , STATE_FILTER , NAMEFLAG_HEURISTIC , RunLengthDecode_cb } ,
{ " CCITTFaxDecode " , OBJ_FILTER_FAX , STATE_FILTER , STATE_FILTER , NAMEFLAG_HEURISTIC , CCITTFaxDecode_cb } ,
{ " CCF " , OBJ_FILTER_FAX , STATE_FILTER , STATE_FILTER , NAMEFLAG_HEURISTIC , CCITTFaxDecode_cb } ,
{ " JBIG2Decode " , OBJ_FILTER_DCT , STATE_FILTER , STATE_FILTER , NAMEFLAG_HEURISTIC , JBIG2Decode_cb } ,
{ " DCTDecode " , OBJ_FILTER_DCT , STATE_FILTER , STATE_FILTER , NAMEFLAG_HEURISTIC , DCTDecode_cb } ,
{ " DCT " , OBJ_FILTER_DCT , STATE_FILTER , STATE_FILTER , NAMEFLAG_HEURISTIC , DCTDecode_cb } ,
{ " JPXDecode " , OBJ_FILTER_JPX , STATE_FILTER , STATE_FILTER , NAMEFLAG_HEURISTIC , JPXDecode_cb } ,
2018-12-03 12:40:13 -05:00
{ " Crypt " , OBJ_FILTER_CRYPT , STATE_FILTER , STATE_NONE , NAMEFLAG_HEURISTIC , Crypt_cb } ,
2014-07-30 14:20:45 -04:00
{ " Standard " , OBJ_FILTER_STANDARD , STATE_FILTER , STATE_FILTER , NAMEFLAG_HEURISTIC , Standard_cb } ,
2018-12-03 12:40:13 -05:00
{ " Sig " , OBJ_SIGNED , STATE_ANY , STATE_NONE , NAMEFLAG_HEURISTIC , Sig_cb } ,
{ " V " , OBJ_SIGNED , STATE_ANY , STATE_NONE , NAMEFLAG_HEURISTIC , NULL } ,
{ " R " , OBJ_SIGNED , STATE_ANY , STATE_NONE , NAMEFLAG_HEURISTIC , NULL } ,
2014-07-30 14:20:45 -04:00
{ " Linearized " , OBJ_DICT , STATE_NONE , STATE_LINEARIZED , NAMEFLAG_HEURISTIC , NULL } ,
{ " Filter " , OBJ_HASFILTERS , STATE_ANY , STATE_FILTER , NAMEFLAG_HEURISTIC , NULL } ,
2020-07-01 16:56:26 -07:00
{ " JavaScript " , OBJ_JAVASCRIPT , STATE_ANY , STATE_JAVASCRIPT , NAMEFLAG_HEURISTIC , JavaScript_cb } ,
2014-07-30 14:20:45 -04:00
{ " Length " , OBJ_DICT , STATE_FILTER , STATE_NONE , NAMEFLAG_HEURISTIC , NULL } ,
{ " S " , OBJ_DICT , STATE_NONE , STATE_S , NAMEFLAG_HEURISTIC , NULL } ,
{ " Type " , OBJ_DICT , STATE_NONE , STATE_NONE , NAMEFLAG_HEURISTIC , NULL } ,
{ " OpenAction " , OBJ_OPENACTION , STATE_ANY , STATE_OPENACTION , NAMEFLAG_HEURISTIC , OpenAction_cb } ,
{ " Launch " , OBJ_LAUNCHACTION , STATE_ANY , STATE_LAUNCHACTION , NAMEFLAG_HEURISTIC , Launch_cb } ,
{ " Page " , OBJ_PAGE , STATE_NONE , STATE_NONE , NAMEFLAG_HEURISTIC , Page_cb } ,
{ " Contents " , OBJ_CONTENTS , STATE_NONE , STATE_CONTENTS , NAMEFLAG_HEURISTIC , NULL } ,
{ " Author " , OBJ_DICT , STATE_NONE , STATE_NONE , NAMEFLAG_NONE , Author_cb } ,
{ " Producer " , OBJ_DICT , STATE_NONE , STATE_NONE , NAMEFLAG_NONE , Producer_cb } ,
{ " CreationDate " , OBJ_DICT , STATE_NONE , STATE_NONE , NAMEFLAG_NONE , CreationDate_cb } ,
{ " ModDate " , OBJ_DICT , STATE_NONE , STATE_NONE , NAMEFLAG_NONE , ModificationDate_cb } ,
{ " Creator " , OBJ_DICT , STATE_NONE , STATE_NONE , NAMEFLAG_NONE , Creator_cb } ,
{ " Title " , OBJ_DICT , STATE_NONE , STATE_NONE , NAMEFLAG_NONE , Title_cb } ,
{ " Keywords " , OBJ_DICT , STATE_NONE , STATE_NONE , NAMEFLAG_NONE , Keywords_cb } ,
{ " Subject " , OBJ_DICT , STATE_NONE , STATE_NONE , NAMEFLAG_NONE , Subject_cb } ,
{ " Pages " , OBJ_DICT , STATE_NONE , STATE_NONE , NAMEFLAG_NONE , Pages_cb } ,
{ " Colors " , OBJ_DICT , STATE_NONE , STATE_NONE , NAMEFLAG_NONE , Colors_cb } ,
{ " RichMedia " , OBJ_DICT , STATE_NONE , STATE_NONE , NAMEFLAG_NONE , RichMedia_cb } ,
{ " AcroForm " , OBJ_DICT , STATE_NONE , STATE_NONE , NAMEFLAG_NONE , AcroForm_cb } ,
2025-04-07 16:50:09 -07:00
{ " XFA " , OBJ_DICT , STATE_NONE , STATE_NONE , NAMEFLAG_NONE , XFA_cb } ,
{ " URI " , OBJ_DICT , STATE_NONE , STATE_URI , NAMEFLAG_NONE , URI_cb } } ;
2010-05-10 23:41:34 +03:00
2010-07-30 17:13:46 +03:00
/*
 * Mask of the object-flag bits that correspond to stream filters the parser
 * recognizes. handle_pdfname() uses it both to decide whether a filter gets
 * recorded in the object's filter list and whether an unmatched name seen in
 * STATE_FILTER should be flagged as an unknown filter.
 */
#define KNOWN_FILTERS            \
    ((1 << OBJ_FILTER_AH) |      \
     (1 << OBJ_FILTER_RL) |      \
     (1 << OBJ_FILTER_A85) |     \
     (1 << OBJ_FILTER_FLATE) |   \
     (1 << OBJ_FILTER_LZW) |     \
     (1 << OBJ_FILTER_FAX) |     \
     (1 << OBJ_FILTER_DCT) |     \
     (1 << OBJ_FILTER_JPX) |     \
     (1 << OBJ_FILTER_CRYPT))
2010-07-30 14:23:10 +03:00
2014-04-07 16:39:54 -04:00
static void handle_pdfname ( struct pdf_struct * pdf , struct pdf_obj * obj , const char * pdfname , int escapes , enum objstate * state )
2010-05-10 23:41:34 +03:00
{
struct pdfname_action * act = NULL ;
unsigned j ;
2014-04-07 16:39:54 -04:00
2025-04-07 16:50:09 -07:00
// If we process STATE_S we will get duplicate URIs from the prior STATE_NONE
if ( ! strcmp ( pdfname , " URI " ) & & * state = = STATE_S ) {
* state = STATE_NONE ;
return ;
}
2014-06-10 22:13:12 -04:00
obj - > statsflags | = OBJ_FLAG_PDFNAME_DONE ;
2025-04-07 16:50:09 -07:00
// Check to see if this object was observed to be a reference to a URI
if ( obj - > flags & ( 1 < < OBJ_URI ) ) {
act = & ( struct pdfname_action ) { " URI " , OBJ_DICT , STATE_ANY , STATE_URI , NAMEFLAG_NONE , URI_cb } ;
}
if ( ! act ) {
for ( j = 0 ; j < sizeof ( pdfname_actions ) / sizeof ( pdfname_actions [ 0 ] ) ; j + + ) {
if ( ! strcmp ( pdfname , pdfname_actions [ j ] . pdfname ) ) {
act = & pdfname_actions [ j ] ;
break ;
}
2014-04-07 16:39:54 -04:00
}
2010-05-10 23:41:34 +03:00
}
2014-04-07 16:39:54 -04:00
2010-07-30 14:23:10 +03:00
if ( ! act ) {
2014-04-07 16:39:54 -04:00
/* these are digital signature objects, filter doesn't matter,
* we don ' t need them anyway */
if ( * state = = STATE_FILTER & & ! ( obj - > flags & ( 1 < < OBJ_SIGNED ) ) & & ! ( obj - > flags & KNOWN_FILTERS ) ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " handle_pdfname: unknown filter %s \n " , pdfname ) ;
2014-04-07 16:39:54 -04:00
obj - > flags | = 1 < < OBJ_FILTER_UNKNOWN ;
}
return ;
2010-07-30 14:23:10 +03:00
}
2014-04-07 16:39:54 -04:00
2016-03-28 13:16:17 -04:00
/* record filter order */
2017-10-30 17:33:19 -04:00
if ( obj - > numfilters < PDF_FILTERLIST_MAX & & ( * state = = STATE_FILTER ) & & ( ( 1 < < act - > set_objflag ) & KNOWN_FILTERS ) )
2016-03-28 13:16:17 -04:00
obj - > filterlist [ obj - > numfilters + + ] = act - > set_objflag ;
2014-07-30 14:20:45 -04:00
if ( ( act - > nameflags & NAMEFLAG_HEURISTIC ) & & escapes ) {
/* if a commonly used PDF name is escaped that is certainly
suspicious . */
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " handle_pdfname: pdfname %s is escaped \n " , pdfname ) ;
2014-07-30 14:20:45 -04:00
pdfobj_flag ( pdf , obj , ESCAPED_COMMON_PDFNAME ) ;
}
2014-04-16 14:23:16 -04:00
if ( ( act - > pdf_stats_cb ) )
act - > pdf_stats_cb ( pdf , obj , act ) ;
2014-04-07 16:39:54 -04:00
if ( act - > from_state = = * state | | act - > from_state = = STATE_ANY ) {
* state = act - > to_state ;
2017-10-30 17:33:19 -04:00
if ( * state = = STATE_FILTER & & act - > set_objflag ! = OBJ_DICT & & ( obj - > flags & ( 1 < < act - > set_objflag ) ) ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " handle_pdfname: duplicate stream filter %s \n " , pdfname ) ;
2014-04-07 16:39:54 -04:00
pdfobj_flag ( pdf , obj , BAD_STREAM_FILTERS ) ;
}
obj - > flags | = 1 < < act - > set_objflag ;
} else {
/* auto-reset states */
switch ( * state ) {
2018-12-03 12:40:13 -05:00
case STATE_S :
* state = STATE_NONE ;
break ;
default :
break ;
2014-04-07 16:39:54 -04:00
}
2010-05-10 23:41:34 +03:00
}
}
2011-12-15 13:27:31 +02:00
static void pdf_parse_encrypt ( struct pdf_struct * pdf , const char * enc , int len )
{
const char * q , * q2 ;
2018-06-02 20:58:35 -04:00
unsigned long objid ;
unsigned long genid ;
2019-01-22 14:15:46 -05:00
long temp_long ;
2011-12-15 13:27:31 +02:00
if ( len > = 16 & & ! strncmp ( enc , " /EncryptMetadata " , 16 ) ) {
2018-12-03 12:40:13 -05:00
q = cli_memstr ( enc + 16 , len - 16 , " /Encrypt " , 8 ) ;
2014-04-07 16:39:54 -04:00
if ( ! q )
return ;
len - = q - enc ;
enc = q ;
2011-12-15 13:27:31 +02:00
}
2014-04-07 16:39:54 -04:00
2011-12-15 13:27:31 +02:00
q = enc + 8 ;
len - = 8 ;
q2 = pdf_nextobject ( q , len ) ;
if ( ! q2 | | ! isdigit ( * q2 ) )
2014-04-07 16:39:54 -04:00
return ;
2011-12-15 13:27:31 +02:00
len - = q2 - q ;
q = q2 ;
2018-06-01 14:23:25 -04:00
2019-01-22 14:15:46 -05:00
if ( CL_SUCCESS ! = cli_strntol_wrap ( q2 , ( size_t ) len , 0 , 10 , & temp_long ) ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " pdf_parse_encrypt: Found Encrypt dictionary but failed to parse objid \n " ) ;
2018-06-02 20:58:35 -04:00
return ;
2019-01-22 14:15:46 -05:00
} else if ( temp_long < 0 ) {
cli_dbgmsg ( " pdf_parse_encrypt: Encountered invalid negative objid (%ld). \n " , temp_long ) ;
return ;
2018-06-02 20:58:35 -04:00
}
2019-01-22 14:15:46 -05:00
objid = ( unsigned long ) temp_long ;
2018-06-02 20:58:35 -04:00
objid = objid < < 8 ;
2018-12-03 12:40:13 -05:00
q2 = pdf_nextobject ( q , len ) ;
2011-12-15 13:27:31 +02:00
if ( ! q2 | | ! isdigit ( * q2 ) )
2014-04-07 16:39:54 -04:00
return ;
2011-12-15 13:27:31 +02:00
len - = q2 - q ;
q = q2 ;
2018-06-01 14:23:25 -04:00
2019-01-22 14:15:46 -05:00
if ( CL_SUCCESS ! = cli_strntol_wrap ( q2 , ( size_t ) len , 0 , 10 , & temp_long ) ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " pdf_parse_encrypt: Found Encrypt dictionary but failed to parse genid \n " ) ;
2018-06-02 20:58:35 -04:00
return ;
2019-01-22 14:15:46 -05:00
} else if ( temp_long < 0 ) {
cli_dbgmsg ( " pdf_parse_encrypt: Encountered invalid negative genid (%ld). \n " , temp_long ) ;
return ;
2018-06-02 20:58:35 -04:00
}
2019-01-22 14:15:46 -05:00
genid = ( unsigned long ) temp_long ;
2018-12-03 12:40:13 -05:00
objid | = genid & 0xff ;
2011-12-15 13:27:31 +02:00
q2 = pdf_nextobject ( q , len ) ;
if ( ! q2 | | * q2 ! = ' R ' )
2014-04-07 16:39:54 -04:00
return ;
2018-12-03 12:40:13 -05:00
cli_dbgmsg ( " pdf_parse_encrypt: Encrypt dictionary in obj %lu %lu \n " , objid > > 8 , objid & 0xff ) ;
2014-04-07 16:39:54 -04:00
2011-12-15 13:27:31 +02:00
pdf - > enc_objid = objid ;
}
static void pdf_parse_trailer ( struct pdf_struct * pdf , const char * s , long length )
{
const char * enc ;
2014-04-07 16:39:54 -04:00
2011-12-15 13:27:31 +02:00
enc = cli_memstr ( s , length , " /Encrypt " , 8 ) ;
if ( enc ) {
2014-04-07 16:39:54 -04:00
char * newID ;
2020-06-24 10:59:48 +02:00
unsigned int newIDlen = 0 ;
2014-04-07 16:39:54 -04:00
pdf - > flags | = 1 < < ENCRYPTED_PDF ;
pdf_parse_encrypt ( pdf , enc , s + length - enc ) ;
2023-11-03 15:00:46 -04:00
newID = pdf_readstring ( s , length , " /ID " , & newIDlen , NULL , false ) ;
2014-04-07 16:39:54 -04:00
if ( newID ) {
free ( pdf - > fileID ) ;
2020-07-24 08:32:47 -07:00
pdf - > fileID = newID ;
2020-06-24 10:59:48 +02:00
pdf - > fileIDlen = newIDlen ;
2014-04-07 16:39:54 -04:00
}
2011-12-15 13:27:31 +02:00
}
}
2014-06-25 13:36:30 -04:00
void pdf_parseobj ( struct pdf_struct * pdf , struct pdf_obj * obj )
2010-05-10 23:41:34 +03:00
{
/* enough to hold common pdf names, we don't need all the names */
2025-04-07 16:50:09 -07:00
char pdfname [ 64 ] = { 0 } ;
2013-01-24 14:43:58 -05:00
const char * q2 , * q3 ;
2017-12-21 14:39:01 -05:00
const char * nextobj = NULL , * nextopen = NULL , * nextclose = NULL ;
2018-12-03 12:40:13 -05:00
const char * q = NULL ;
2018-08-14 14:00:31 -07:00
const char * dict = NULL , * enddict = NULL , * start = NULL ;
2019-03-05 21:15:41 -05:00
off_t dict_length = 0 , full_dict_length = 0 , bytesleft = 0 ;
2018-12-03 12:40:13 -05:00
size_t i = 0 ;
2018-08-14 14:00:31 -07:00
unsigned filters = 0 , blockopens = 0 ;
2010-05-10 23:41:34 +03:00
enum objstate objstate = STATE_NONE ;
2024-03-25 13:01:46 -04:00
2018-12-03 12:40:13 -05:00
json_object * pdfobj = NULL , * jsonobj = NULL ;
2010-05-10 23:41:34 +03:00
2019-01-22 14:15:46 -05:00
if ( NULL = = pdf | | NULL = = obj ) {
cli_warnmsg ( " pdf_parseobj: invalid arguments \n " ) ;
return ;
}
2019-03-05 21:15:41 -05:00
cli_dbgmsg ( " pdf_parseobj: Parsing object %u %u \n " , obj - > id > > 8 , obj - > id & 0xff ) ;
2019-01-22 13:53:29 -05:00
if ( obj - > objstm ) {
if ( ( size_t ) obj - > start > obj - > objstm - > streambuf_len ) {
cli_dbgmsg ( " pdf_parseobj: %u %u obj: obj start (%u) is greater than size of object stream (%zu). \n " ,
2019-01-22 18:04:53 -05:00
obj - > id > > 8 , obj - > id & 0xff , obj - > start , obj - > objstm - > streambuf_len ) ;
2019-01-22 13:53:29 -05:00
return ;
}
q = ( const char * ) ( obj - > start + obj - > objstm - > streambuf ) ;
} else {
if ( ( size_t ) obj - > start > pdf - > size ) {
cli_dbgmsg ( " pdf_parseobj: %u %u obj: obj start (%u) is greater than size of PDF (%lld). \n " ,
2019-01-22 18:04:53 -05:00
obj - > id > > 8 , obj - > id & 0xff , obj - > start , ( long long ) pdf - > size ) ;
2019-01-22 13:53:29 -05:00
return ;
}
q = ( const char * ) ( obj - > start + pdf - > map ) ;
}
start = q ;
2018-08-14 14:00:31 -07:00
2019-03-05 21:15:41 -05:00
if ( obj - > size < = 0 )
2014-04-07 16:39:54 -04:00
return ;
2019-01-22 13:53:29 -05:00
if ( obj - > objstm ) {
2019-03-05 21:15:41 -05:00
bytesleft = MIN ( obj - > size , obj - > objstm - > streambuf_len - obj - > start ) ;
2019-01-22 13:53:29 -05:00
} else {
2019-03-05 21:15:41 -05:00
bytesleft = MIN ( obj - > size , pdf - > size - obj - > start ) ;
}
/* For objects that aren't already in an object stream^, check if they contain a stream.
* ^ Objects in object streams aren ' t supposed to contain streams , so we don ' t check them . */
if ( NULL = = obj - > objstm ) {
/* Check if object contains stream */
cl_error_t has_stream ;
const char * stream = NULL ;
size_t stream_size = 0 ;
has_stream = find_stream_bounds (
start ,
obj - > size ,
& stream ,
& stream_size ,
( pdf - > enc_method_stream < = ENC_IDENTITY ) & & ( pdf - > enc_method_embeddedfile < = ENC_IDENTITY ) ) ;
if ( ( CL_SUCCESS = = has_stream ) | |
( CL_EFORMAT = = has_stream ) ) {
/* Stream found. Store this fact and the stream bounds. */
cli_dbgmsg ( " pdf_parseobj: %u %u contains stream, size: %zu \n " , obj - > id > > 8 , obj - > id & 0xff , stream_size ) ;
obj - > flags | = ( 1 < < OBJ_STREAM ) ;
obj - > stream = stream ;
obj - > stream_size = stream_size ;
}
2019-01-22 13:53:29 -05:00
}
2013-01-24 14:43:58 -05:00
2010-05-10 23:41:34 +03:00
/* find start of dictionary */
do {
2014-04-07 16:39:54 -04:00
nextobj = pdf_nextobject ( q , bytesleft ) ;
2018-12-03 12:40:13 -05:00
bytesleft - = nextobj - q ;
2014-04-07 16:39:54 -04:00
if ( ! nextobj | | bytesleft < 0 ) {
2018-12-03 12:40:13 -05:00
cli_dbgmsg ( " pdf_parseobj: %u %u obj: no dictionary \n " , obj - > id > > 8 , obj - > id & 0xff ) ;
2024-03-25 13:01:46 -04:00
2025-06-08 01:12:33 -04:00
if ( ! ( pdfobj ) & & pdf - > ctx - > this_layer_metadata_json ! = NULL ) {
pdfobj = cli_jsonobj ( pdf - > ctx - > this_layer_metadata_json , " PDFStats " ) ;
2014-06-30 15:43:53 -04:00
if ( ! ( pdfobj ) )
return ;
}
if ( pdfobj ) {
if ( ! ( jsonobj ) )
jsonobj = cli_jsonarray ( pdfobj , " ObjectsWithoutDictionaries " ) ;
if ( jsonobj )
2018-12-03 12:40:13 -05:00
cli_jsonint_array ( jsonobj , obj - > id > > 8 ) ;
2014-06-30 15:43:53 -04:00
}
2024-03-25 13:01:46 -04:00
2014-04-07 16:39:54 -04:00
return ;
}
2019-01-22 14:15:46 -05:00
/*
* Opening ` < ` for object ' s dictionary may be back 1 character ,
* provided q is not at the start of the buffer ( it shouldn ' t be ) .
*/
if ( obj - > objstm ) {
if ( obj - > objstm - > streambuf = = q ) {
q3 = memchr ( q , ' < ' , nextobj - q ) ;
} else {
q3 = memchr ( q - 1 , ' < ' , nextobj - q + 1 ) ;
}
} else {
if ( pdf - > map = = q ) {
q3 = memchr ( q , ' < ' , nextobj - q ) ;
} else {
q3 = memchr ( q - 1 , ' < ' , nextobj - q + 1 ) ;
}
}
2014-04-07 16:39:54 -04:00
nextobj + + ;
bytesleft - - ;
q = nextobj ;
2010-05-10 23:41:34 +03:00
} while ( ! q3 | | q3 [ 1 ] ! = ' < ' ) ;
2018-12-03 12:40:13 -05:00
dict = q3 + 2 ;
q = dict ;
2013-01-24 14:43:58 -05:00
blockopens + + ;
2019-03-05 21:15:41 -05:00
bytesleft = obj - > size - ( q - start ) ;
2018-12-03 12:40:13 -05:00
enddict = q + bytesleft - 1 ;
2013-01-24 14:43:58 -05:00
/* find end of dictionary block */
2013-02-12 10:24:01 -05:00
if ( bytesleft < 0 ) {
2018-12-03 12:40:13 -05:00
cli_dbgmsg ( " pdf_parseobj: %u %u obj: broken dictionary \n " , obj - > id > > 8 , obj - > id & 0xff ) ;
2024-03-25 13:01:46 -04:00
2025-06-08 01:12:33 -04:00
if ( ! ( pdfobj ) & & pdf - > ctx - > this_layer_metadata_json ! = NULL ) {
pdfobj = cli_jsonobj ( pdf - > ctx - > this_layer_metadata_json , " PDFStats " ) ;
2014-06-30 15:43:53 -04:00
if ( ! ( pdfobj ) )
return ;
}
if ( pdfobj ) {
if ( ! ( jsonobj ) )
jsonobj = cli_jsonarray ( pdfobj , " ObjectsWithBrokenDictionaries " ) ;
if ( jsonobj )
2018-12-03 12:40:13 -05:00
cli_jsonint_array ( jsonobj , obj - > id > > 8 ) ;
2014-06-30 15:43:53 -04:00
}
2024-03-25 13:01:46 -04:00
2013-02-12 10:24:01 -05:00
return ;
}
2013-01-24 14:43:58 -05:00
2013-02-12 10:24:01 -05:00
/* while still looking ... */
2018-12-03 12:40:13 -05:00
while ( ( q < enddict - 1 ) & & ( blockopens > 0 ) ) {
2013-02-12 10:24:01 -05:00
/* find next close */
2018-12-03 12:40:13 -05:00
nextclose = memchr ( q , ' > ' , enddict - q ) ;
2013-02-12 10:24:01 -05:00
if ( nextclose & & ( nextclose [ 1 ] = = ' > ' ) ) {
/* check for nested open */
2018-12-03 12:40:13 -05:00
while ( ( nextopen = memchr ( q - 1 , ' < ' , nextclose - q + 1 ) ) ! = NULL ) {
2013-02-12 10:24:01 -05:00
if ( nextopen [ 1 ] = = ' < ' ) {
/* nested open */
blockopens + + ;
q = nextopen + 2 ;
2018-12-03 12:40:13 -05:00
} else {
2013-02-12 10:24:01 -05:00
/* unmatched < before next close */
q = nextopen + 2 ;
2013-01-24 14:43:58 -05:00
}
}
2013-02-12 10:24:01 -05:00
/* close block */
blockopens - - ;
q = nextclose + 2 ;
2018-12-03 12:40:13 -05:00
} else if ( nextclose ) {
2013-02-12 10:24:01 -05:00
/* found one > but not two */
q = nextclose + 2 ;
2018-12-03 12:40:13 -05:00
} else {
2013-02-12 10:24:01 -05:00
/* next closing not found */
2013-03-15 11:29:25 -04:00
break ;
2013-02-12 10:24:01 -05:00
}
}
2013-01-24 14:43:58 -05:00
2013-02-12 10:24:01 -05:00
/* Was end of dictionary found? */
2013-03-15 11:29:25 -04:00
if ( blockopens ) {
/* probably truncated */
2018-12-03 12:40:13 -05:00
cli_dbgmsg ( " pdf_parseobj: %u %u obj broken dictionary \n " , obj - > id > > 8 , obj - > id & 0xff ) ;
2024-03-25 13:01:46 -04:00
2025-06-08 01:12:33 -04:00
if ( ! ( pdfobj ) & & pdf - > ctx - > this_layer_metadata_json ! = NULL ) {
pdfobj = cli_jsonobj ( pdf - > ctx - > this_layer_metadata_json , " PDFStats " ) ;
2014-06-30 15:43:53 -04:00
if ( ! ( pdfobj ) )
return ;
}
if ( pdfobj ) {
if ( ! ( jsonobj ) )
jsonobj = cli_jsonarray ( pdfobj , " ObjectsWithBrokenDictionaries " ) ;
if ( jsonobj )
2018-12-03 12:40:13 -05:00
cli_jsonint_array ( jsonobj , obj - > id > > 8 ) ;
2014-06-30 15:43:53 -04:00
}
2024-03-25 13:01:46 -04:00
2013-02-12 10:24:01 -05:00
return ;
2013-03-15 11:29:25 -04:00
}
2014-04-07 16:39:54 -04:00
2013-01-24 14:43:58 -05:00
enddict = nextclose ;
2010-05-10 23:41:34 +03:00
obj - > flags | = 1 < < OBJ_DICT ;
2013-01-24 14:43:58 -05:00
full_dict_length = dict_length = enddict - dict ;
/* This code prints the dictionary content.
{
char * dictionary = malloc ( dict_length + 1 ) ;
if ( dictionary ) {
2013-02-12 10:24:01 -05:00
for ( i = 0 ; i < dict_length ; i + + ) {
2016-04-01 15:20:36 -04:00
if ( dict [ i ] = = ' \r ' )
dictionary [ i ] = ' \n ' ;
else if ( isprint ( dict [ i ] ) | | isspace ( dict [ i ] ) )
2013-02-12 10:24:01 -05:00
dictionary [ i ] = dict [ i ] ;
else
dictionary [ i ] = ' * ' ;
}
2013-01-24 14:43:58 -05:00
dictionary [ dict_length ] = ' \0 ' ;
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " pdf_parseobj: dictionary is <<%s>> \n " , dictionary ) ;
2013-01-24 14:43:58 -05:00
free ( dictionary ) ;
}
}
*/
2010-05-10 23:41:34 +03:00
2010-05-11 13:23:20 +03:00
/* process pdf names */
2018-12-03 12:40:13 -05:00
for ( q = dict ; dict_length > 0 ; ) {
int escapes = 0 , breakout = 0 ;
2014-04-07 16:39:54 -04:00
q2 = memchr ( q , ' / ' , dict_length ) ;
if ( ! q2 )
break ;
dict_length - = q2 - q ;
q = q2 ;
/* normalize PDF names */
2018-12-03 12:40:13 -05:00
for ( i = 0 ; dict_length > 0 & & ( i < sizeof ( pdfname ) - 1 ) ; i + + ) {
2014-04-07 16:39:54 -04:00
q + + ;
dict_length - - ;
if ( * q = = ' # ' ) {
2018-12-03 12:40:13 -05:00
if ( cli_hex2str_to ( q + 1 , pdfname + i , 2 ) = = - 1 )
2014-04-07 16:39:54 -04:00
break ;
q + = 2 ;
dict_length - = 2 ;
escapes = 1 ;
continue ;
}
switch ( * q ) {
2018-12-03 12:40:13 -05:00
case ' ' :
case ' \t ' :
case ' \r ' :
case ' \n ' :
case ' / ' :
case ' > ' :
case ' [ ' :
case ' ] ' :
case ' < ' :
case ' ( ' :
breakout = 1 ;
2014-04-07 16:39:54 -04:00
}
if ( breakout )
break ;
pdfname [ i ] = * q ;
}
pdfname [ i ] = ' \0 ' ;
handle_pdfname ( pdf , obj , pdfname , escapes , & objstate ) ;
if ( objstate = = STATE_LINEARIZED ) {
long trailer_end , trailer ;
pdfobj_flag ( pdf , obj , LINEARIZED_PDF ) ;
2018-12-03 12:40:13 -05:00
objstate = STATE_NONE ;
2014-04-07 16:39:54 -04:00
trailer_end = pdf_readint ( dict , full_dict_length , " /H " ) ;
2019-03-05 21:15:41 -05:00
if ( ( trailer_end > 0 ) & & ( ( size_t ) trailer_end < pdf - > size ) ) {
2014-04-07 16:39:54 -04:00
trailer = trailer_end - 1024 ;
if ( trailer < 0 )
trailer = 0 ;
q2 = pdf - > map + trailer ;
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " pdf_parseobj: looking for trailer in linearized pdf: %ld - %ld \n " , trailer , trailer_end ) ;
2014-04-07 16:39:54 -04:00
pdf_parse_trailer ( pdf , q2 , trailer_end - trailer ) ;
if ( pdf - > fileID )
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " pdf_parseobj: found fileID \n " ) ;
2014-04-07 16:39:54 -04:00
}
}
if ( objstate = = STATE_LAUNCHACTION )
pdfobj_flag ( pdf , obj , HAS_LAUNCHACTION ) ;
2025-04-07 16:50:09 -07:00
if ( dict_length > 0 & & ( objstate = = STATE_JAVASCRIPT | |
objstate = = STATE_OPENACTION | |
objstate = = STATE_CONTENTS | |
objstate = = STATE_URI ) ) {
2018-03-08 12:21:16 -05:00
off_t dict_remaining = dict_length ;
2014-04-07 16:39:54 -04:00
if ( objstate = = STATE_OPENACTION )
pdfobj_flag ( pdf , obj , HAS_OPENACTION ) ;
2018-03-08 12:21:16 -05:00
q2 = pdf_nextobject ( q , dict_remaining ) ;
2014-04-07 16:39:54 -04:00
if ( q2 & & isdigit ( * q2 ) ) {
2018-12-03 12:40:13 -05:00
const char * q2_old = NULL ;
2018-06-02 20:58:35 -04:00
unsigned long objid ;
unsigned long genid ;
2019-01-22 14:15:46 -05:00
long temp_long ;
2018-06-02 20:58:35 -04:00
2018-03-08 12:21:16 -05:00
dict_remaining - = ( off_t ) ( q2 - q ) ;
2019-01-22 14:15:46 -05:00
if ( CL_SUCCESS ! = cli_strntol_wrap ( q2 , ( size_t ) dict_remaining , 0 , 10 , & temp_long ) ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " pdf_parseobj: failed to parse object objid \n " ) ;
2018-06-02 20:58:35 -04:00
return ;
2019-01-22 14:15:46 -05:00
} else if ( temp_long < 0 ) {
cli_dbgmsg ( " pdf_parseobj: Encountered invalid negative genid (%ld). \n " , temp_long ) ;
return ;
2018-06-02 20:58:35 -04:00
}
2019-01-22 14:15:46 -05:00
objid = ( unsigned long ) temp_long ;
2018-06-02 20:58:35 -04:00
objid = objid < < 8 ;
2019-01-22 18:04:53 -05:00
while ( ( dict_remaining > 0 ) & & isdigit ( * q2 ) ) {
2014-04-07 16:39:54 -04:00
q2 + + ;
2019-01-22 18:04:53 -05:00
dict_remaining - - ;
}
2014-04-07 16:39:54 -04:00
2018-03-08 12:21:16 -05:00
q2_old = q2 ;
2018-12-03 12:40:13 -05:00
q2 = pdf_nextobject ( q2 , dict_remaining ) ;
2014-04-07 16:39:54 -04:00
if ( q2 & & isdigit ( * q2 ) ) {
2018-03-08 12:21:16 -05:00
dict_remaining - = ( off_t ) ( q2 - q2_old ) ;
2019-01-22 14:15:46 -05:00
if ( CL_SUCCESS ! = cli_strntol_wrap ( q2 , ( size_t ) dict_remaining , 0 , 10 , & temp_long ) ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " pdf_parseobj: failed to parse object genid \n " ) ;
2018-06-02 20:58:35 -04:00
return ;
2019-01-22 14:15:46 -05:00
} else if ( temp_long < 0 ) {
cli_dbgmsg ( " pdf_parseobj: Encountered invalid negative genid (%ld). \n " , temp_long ) ;
return ;
2018-06-02 20:58:35 -04:00
}
2019-01-22 14:15:46 -05:00
genid = ( unsigned long ) temp_long ;
2018-06-02 20:58:35 -04:00
objid | = genid & 0xff ;
2014-04-07 16:39:54 -04:00
2018-03-08 12:21:16 -05:00
q2 = pdf_nextobject ( q2 , dict_remaining ) ;
2014-04-07 16:39:54 -04:00
if ( q2 & & * q2 = = ' R ' ) {
struct pdf_obj * obj2 ;
2018-12-03 12:40:13 -05:00
cli_dbgmsg ( " pdf_parseobj: found %s stored in indirect object %lu %lu \n " , pdfname , objid > > 8 , objid & 0xff ) ;
2014-04-07 16:39:54 -04:00
obj2 = find_obj ( pdf , obj , objid ) ;
if ( obj2 ) {
2020-07-01 16:56:26 -07:00
enum pdf_objflags flag = OBJ_STREAM ;
switch ( objstate ) {
case STATE_JAVASCRIPT :
flag = OBJ_JAVASCRIPT ;
break ;
case STATE_OPENACTION :
flag = OBJ_OPENACTION ;
break ;
case STATE_CONTENTS :
flag = OBJ_CONTENTS ;
break ;
2025-04-07 16:50:09 -07:00
case STATE_URI :
flag = OBJ_URI ;
break ;
2020-07-01 16:56:26 -07:00
default :
cli_dbgmsg ( " pdf_parseobj: Unexpected object type \n " ) ;
return ;
}
2014-04-07 16:39:54 -04:00
2020-07-01 16:56:26 -07:00
obj - > flags & = ~ ( 1 < < flag ) ; /* Disable flag for current object ... */
obj2 - > flags | = 1 < < flag ; /* ... and set the flag for the indirect object instead! */
2014-04-07 16:39:54 -04:00
} else {
pdfobj_flag ( pdf , obj , BAD_INDOBJ ) ;
}
}
}
}
objstate = STATE_NONE ;
}
2010-05-10 23:41:34 +03:00
}
2014-04-07 16:39:54 -04:00
2018-12-03 12:40:13 -05:00
for ( i = 0 ; i < sizeof ( pdfname_actions ) / sizeof ( pdfname_actions [ 0 ] ) ; i + + ) {
2014-04-07 16:39:54 -04:00
const struct pdfname_action * act = & pdfname_actions [ i ] ;
if ( ( obj - > flags & ( 1 < < act - > set_objflag ) ) & &
act - > from_state = = STATE_FILTER & &
act - > to_state = = STATE_FILTER & &
act - > set_objflag ! = OBJ_FILTER_CRYPT & &
act - > set_objflag ! = OBJ_FILTER_STANDARD ) {
filters + + ;
}
2010-08-01 22:14:44 +03:00
}
2014-04-07 16:39:54 -04:00
if ( filters > 2 ) {
/* more than 2 non-crypt filters */
pdfobj_flag ( pdf , obj , MANY_FILTERS ) ;
2010-08-01 22:14:44 +03:00
}
2014-04-07 16:39:54 -04:00
2010-07-30 20:26:59 +03:00
if ( obj - > flags & ( ( 1 < < OBJ_SIGNED ) | KNOWN_FILTERS ) )
2014-04-07 16:39:54 -04:00
obj - > flags & = ~ ( 1 < < OBJ_FILTER_UNKNOWN ) ;
2010-07-30 20:26:59 +03:00
if ( obj - > flags & ( 1 < < OBJ_FILTER_UNKNOWN ) )
2014-04-07 16:39:54 -04:00
pdfobj_flag ( pdf , obj , UNKNOWN_FILTER ) ;
2018-12-03 12:40:13 -05:00
cli_dbgmsg ( " pdf_parseobj: %u %u obj flags: %02x \n " , obj - > id > > 8 , obj - > id & 0xff , obj - > flags ) ;
2010-05-10 23:41:34 +03:00
}
2018-06-08 10:13:13 -07:00
/**
 * @brief Look up a key inside a PDF dictionary and return a pointer to its value.
 *
 * The key is located with a plain byte search, then the object that follows
 * it is taken as the value. If the value is immediately preceded by '<' or
 * newline characters, the returned pointer is moved back over them so that a
 * dictionary or hex-string value keeps its opening bracket(s).
 *
 * @param q0            Start of the dictionary bytes to search. May be NULL.
 * @param[in,out] len   In:  The number of bytes available at q0.
 *                      Out: The number of bytes remaining from the returned
 *                           pointer to the end of the dictionary. Note that
 *                           it is already reduced to "bytes from the key"
 *                           when the key was found but no value follows it.
 * @param key           NUL-terminated key to search for.
 * @return const char*  Address of the key's value, or NULL if the length is
 *                      not positive, q0 is NULL, the key is missing, or no
 *                      object follows the key.
 */
static const char *pdf_getdict(const char *q0, int *len, const char *key)
{
    const char *key_pos;
    const char *value;

    if (*len <= 0) {
        cli_dbgmsg("pdf_getdict: bad length %d\n", *len);
        return NULL;
    }

    if (NULL == q0) {
        return NULL;
    }

    /* Locate the key within the dictionary bytes. */
    key_pos = cli_memstr(q0, *len, key, strlen(key));
    if (NULL == key_pos) {
        cli_dbgmsg("pdf_getdict: %s not found in dict\n", key);
        return NULL;
    }
    *len -= key_pos - q0;

    /* The value is the next object after the key. */
    value = pdf_nextobject(key_pos + 1, *len - 1);
    if (NULL == value) {
        cli_dbgmsg("pdf_getdict: %s is invalid in dict\n", key);
        return NULL;
    }

    /* Walk back over '<' and newline characters so the caller sees the
     * opening bracket(s) of a dictionary or hex string value. Never move
     * back past the key itself. */
    for (; value > key_pos; value--) {
        if (value[-1] != '<' && value[-1] != '\n') {
            break;
        }
    }

    *len -= value - key_pos;
    return value;
}
2023-11-03 15:00:46 -04:00
/**
 * @brief Read the value string from a PDF dictionary key/value pair.
 *
 * Two value encodings are handled:
 *  - literal strings "( ... )", with backslash escapes decoded unless
 *    noescape is set, and
 *  - hexadecimal strings "< ... >", which are converted to raw bytes.
 *
 * @param q0        A pointer into the PDF dictionary.
 * @param len       The bytes remaining in the file.
 * @param key       The key we're looking for.
 * @param[out] slen [Optional] The length of the output string, not counting
 *                  the terminating NUL. Reset to 0 on entry, so it stays 0
 *                  whenever NULL is returned.
 * @param[out] qend [Optional] The pointer we wound up at: one past the
 *                  closing ')' for a literal string, or the closing '>' for
 *                  a hex string. Reset to q0 on entry.
 * @param noescape  Select 'true' to copy the literal string bytes verbatim,
 *                  'false' to decode escape sequences.
 * @return char*    A newly allocated, NUL-terminated buffer that the caller
 *                  must free(), or NULL if the key is missing, the value is
 *                  not a string, the value is truncated, or allocation fails.
 */
static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *slen, const char **qend, bool noescape)
{
    char *s, *s0;
    const char *start, *q, *end;

    if (slen)
        *slen = 0;

    if (qend)
        *qend = q0;

    q = pdf_getdict(q0, &len, key);
    if (!q || len <= 0)
        return NULL;

    if (*q == '(') {
        int paren = 1;

        start = ++q;
        len--;

        /* Find the matching close paren. A backslash hides the character
         * that follows it, so escaped parens do not affect nesting. */
        for (; paren > 0 && len > 0; q++, len--) {
            switch (*q) {
                case '(':
                    paren++;
                    break;
                case ')':
                    paren--;
                    break;
                case '\\':
                    q++;
                    len--;
                    break;
                default:
                    break;
            }
        }

        if (len <= 0) {
            cli_errmsg("pdf_readstring: Invalid, truncated dictionary.\n");
            return NULL;
        }

        if (qend)
            *qend = q;

        /* q is one past the closing paren; step back onto it so that
         * [start, q) is exactly the string content. */
        q--;
        len = q - start;

        /* Decoding never produces more bytes than it consumes, so len + 1
         * is always enough room for the result plus the terminator. */
        s0 = s = cli_max_malloc(len + 1);
        if (!s) {
            cli_errmsg("pdf_readstring: Unable to allocate buffer\n");
            return NULL;
        }

        end = start + len;
        if (noescape) {
            memcpy(s0, start, len);
            s = s0 + len;
        } else {
            for (q = start; q < end; q++) {
                if (*q != '\\') {
                    *s++ = *q;
                    continue;
                }

                /* Move onto the character that selects the escape. */
                q++;
                if (q >= end) {
                    /* Dangling backslash at the very end: nothing to decode. */
                    break;
                }

                switch (*q) {
                    case 'n':
                        *s++ = '\n';
                        break;
                    case 'r':
                        *s++ = '\r';
                        break;
                    case 't':
                        *s++ = '\t';
                        break;
                    case 'b':
                        *s++ = '\b';
                        break;
                    case 'f':
                        *s++ = '\f';
                        break;
                    case '(': /* fall-through */
                    case ')': /* fall-through */
                    case '\\':
                        *s++ = *q;
                        break;
                    case '\n':
                        /* line continuation: ignore */
                        break;
                    case '\r':
                        /* line continuation: ignore, along with a following LF */
                        if (q + 1 < end && q[1] == '\n')
                            q++;
                        break;
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7': {
                        /* Octal escape: one to three octal digits. Stop at
                         * the first non-octal character or at the end of
                         * the string, and keep only the low 8 bits. */
                        unsigned value = 0;
                        int digits     = 0;

                        while (digits < 3 && q < end && *q >= '0' && *q <= '7') {
                            value = (value * 8) + (unsigned)(*q - '0');
                            q++;
                            digits++;
                        }
                        /* The for-loop increment will step past the last digit. */
                        q--;
                        *s++ = (char)(value & 0xff);
                        break;
                    }
                    default:
                        /* Unknown escape (this includes '8' and '9', which
                         * are not octal digits): keep the backslash and let
                         * the next iteration copy the character itself. */
                        *s++ = '\\';
                        q--;
                        break;
                }
            }
        }

        *s++ = '\0';
        if (slen)
            *slen = s - s0 - 1;

        return s0;
    }

    if ((*q == '<') && (len >= 3)) {
        start = ++q;
        len -= 1;

        // skip newlines after <
        while (len > 0 && *start == '\n') {
            start = ++q;
            len -= 1;
        }

        /* Skipping newlines may have used up the whole buffer. Without this
         * check "len - 1" below would wrap to a huge size_t and memchr()
         * would read far out of bounds. */
        if (len < 2) {
            cli_dbgmsg("pdf_readstring: %s has truncated hex string\n", key);
            return NULL;
        }

        q = memchr(q + 1, '>', len - 1);
        if (!q)
            return NULL;

        if (qend)
            *qend = q;

        s = cli_max_malloc((q - start) / 2 + 1);
        if (s == NULL) { /* oops, couldn't allocate memory */
            cli_dbgmsg("pdf_readstring: unable to allocate memory...\n");
            return NULL;
        }

        if (cli_hex2str_to(start, s, q - start)) {
            cli_dbgmsg("pdf_readstring: %s has bad hex value\n", key);
            free(s);
            return NULL;
        }

        s[(q - start) / 2] = '\0';
        if (slen)
            *slen = (q - start) / 2;

        return s;
    }

    cli_dbgmsg("pdf_readstring: %s is invalid string in dict\n", key);
    return NULL;
}
2011-12-14 15:43:14 +02:00
/**
 * @brief Read a name value (e.g. "/Foo") from a PDF dictionary key/value pair.
 *
 * Leading spaces before the value are skipped. The value must start with
 * '/'; the text after that slash is copied up to the next '/', the next
 * ">>", a NUL byte, or the end of the available bytes, whichever comes
 * first. Trailing whitespace is trimmed from the copy.
 *
 * @param q     A pointer into the PDF dictionary.
 * @param len   The number of bytes available at q.
 * @param key   NUL-terminated key to search for.
 * @return char* Newly allocated, NUL-terminated copy of the name without its
 *               leading '/', or NULL if the key is missing, the value is not
 *               a name, or allocation fails.
 */
static char *pdf_readval(const char *q, int len, const char *key)
{
    const char *end;
    char *s;
    int origlen = len;

    q = pdf_getdict(q, &len, key);
    if (!q || len <= 0)
        return NULL;

    while (len > 0 && *q == ' ') {
        q++;
        len--;
    }

    /* The spaces may have run to the end of the buffer; do not look at the
     * byte past it. */
    if (len <= 0)
        return NULL;

    if (*q != '/')
        return NULL;

    q++;
    len--;
    end = q;

    while (len > 0 && *end && !(*end == '/' || (len > 1 && end[0] == '>' && end[1] == '>'))) {
        end++;
        len--;
    }

    /* end-of-buffer whitespace trimming.
     * The cast keeps isspace() well-defined for bytes >= 0x80 where plain
     * char is signed. The '/' just before q stops the scan from moving
     * end below q. */
    while (len < origlen && isspace((unsigned char)*(end - 1))) {
        end--;
        len++;
    }

    s = cli_max_malloc(end - q + 1);
    if (!s)
        return NULL;

    memcpy(s, q, end - q);
    s[end - q] = '\0';

    return s;
}
2011-05-07 18:06:06 +03:00
static int pdf_readint ( const char * q0 , int len , const char * key )
{
2018-12-03 12:40:13 -05:00
long value = 0 ;
const char * q = pdf_getdict ( q0 , & len , key ) ;
2014-04-07 16:39:54 -04:00
2018-06-02 20:58:35 -04:00
if ( q = = NULL ) {
value = - 1 ;
2018-12-03 12:40:13 -05:00
} else if ( CL_SUCCESS ! = cli_strntol_wrap ( q , ( size_t ) len , 0 , 10 , & value ) ) {
2018-06-02 20:58:35 -04:00
value = - 1 ;
}
return value ;
2011-05-07 18:06:06 +03:00
}
/**
 * @brief Read a boolean value from a PDF dictionary key/value pair.
 *
 * @param q0        A pointer into the PDF dictionary.
 * @param len       The number of bytes available at q0.
 * @param key       NUL-terminated key to search for.
 * @param Default   Value to return when the key is missing, fewer than 5
 *                  bytes remain at the value, or the value is neither
 *                  "true" nor "false".
 * @return int      1 for "true", 0 for "false", otherwise Default.
 */
static int pdf_readbool(const char *q0, int len, const char *key, int Default)
{
    const char *value = pdf_getdict(q0, &len, key);

    if (NULL != value && len >= 5) {
        if (0 == strncmp(value, "true", 4)) {
            return 1;
        }

        if (0 == strncmp(value, "false", 5)) {
            return 0;
        }

        cli_dbgmsg("pdf_readbool: invalid value for %s bool\n", key);
    }

    return Default;
}
/*
 * 32-byte password padding string. check_user_password() copies exactly
 * 32 bytes of it in place of the (empty) user password when deriving and
 * verifying the encryption key (see its "7.6.3.3 Algorithm 2/4/5" steps).
 *
 * The data contains embedded NUL bytes, so it must always be used with an
 * explicit length of 32 and never with strlen()/strcpy().
 */
static const char *key_padding =
    "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4e\x56\xff\xfa\x01\x08"
    "\x2e\x2e\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A";
2011-05-07 18:06:06 +03:00
/**
 * @brief Print a labelled hex dump of a buffer to the debug log.
 *
 * Does nothing unless debug output is enabled (cli_debug_flag).
 *
 * @param msg   Label printed in front of the hex data.
 * @param hex   The bytes to dump.
 * @param len   The number of bytes to dump.
 */
static void dbg_printhex(const char *msg, const char *hex, unsigned len)
{
    char *kh;

    if (!cli_debug_flag) {
        return;
    }

    kh = cli_str2hex(hex, len);
    if (NULL == kh) {
        /* No hex string was produced (presumably an allocation failure).
         * Passing NULL to a "%s" conversion is undefined behaviour, so
         * report the problem instead. */
        cli_dbgmsg("cli_pdf: %s: <unable to hex-encode %u bytes>\n", msg, len);
        return;
    }

    cli_dbgmsg("cli_pdf: %s: %s\n", msg, kh);
    free(kh);
}
2024-01-14 19:22:00 -05:00
/**
 * @brief Compute the hash of the password concatenated with the validation salt and (for owner-password checks) the U string.
 *
 * Some details and comments for how to compute this hash comes from the PyPDF project:
 * https://github.com/py-pdf/pypdf/blob/3.17.4/pypdf/_encryption.py#L568
 *
 * @param password  The password to hash. NULL is treated as an empty password.
 * @param pwlen     The length of the password. At most 128 bytes are used,
 *                  because that is all the internal work buffer has room for.
 * @param salt      The validation salt. Only the first 8 bytes are read.
 * @param hash      The resulting hash (32 bytes are written).
 * @param U         [Optional] The U string (for owner-password checks). When
 *                  given, 48 bytes are read from it.
 */
static void compute_hash_r6(const char *password, size_t pwlen, const unsigned char salt[16], unsigned char hash[32], const char *U)
{
    /* Room for 64 repetitions of: password (<= 128) + block (<= 64) + U (48). */
    unsigned char data[(128 + 64 + 48) * 64];
    unsigned char block[64];
    int32_t block_size = 32;
    size_t in_data_len = 0, out_data_len;
    int32_t i, j, sum;
    uint8_t sha2_256[32], sha2_384[48], sha2_512[64];

    /* Never copy more password bytes than data[] was sized for, and never
     * hand a NULL source pointer to memcpy(). */
    if (NULL == password) {
        pwlen = 0;
    } else if (pwlen > 128) {
        pwlen = 128;
    }

    /*
     * Compute a SHA-256 hash of the UTF-8 password concatenated with the 8 bytes of the owner or user validation salt.
     */
    if (pwlen > 0) {
        memcpy(data, password, pwlen);
    }
    memcpy(data + pwlen, salt, 8);

    if (NULL != U) {
        // If it's for the owner password check, we also concatenate the 48-byte U string.
        memcpy(data + pwlen + 8, U, 48);
        cl_sha256(data, pwlen + 8 + 48, block, NULL);
    } else {
        cl_sha256(data, pwlen + 8, block, NULL);
    }

    /*
     * Run at least 64 rounds, then keep going while the round number is
     * below (last byte of the previous round's encrypted data) + 32.
     * The second operand is only evaluated once i >= 64, by which time
     * in_data_len has been set by an earlier round.
     */
    for (i = 0; i < 64 || i < (data[(in_data_len * 64) - 1] + 32); i++) {
        /* One repetition is: password + current block (+ U). */
        if (pwlen > 0) {
            memcpy(data, password, pwlen);
        }
        memcpy(data + pwlen, block, block_size);

        in_data_len = pwlen + block_size;

        if (NULL != U) {
            // If it's for the owner password check, we also concatenate the 48-byte U string.
            memcpy(data + pwlen + block_size, U, 48);
            in_data_len += 48;
        }

        /* Repeat that sequence 64 times in total. */
        for (j = 1; j < 64; j++)
            memcpy(data + j * in_data_len, data, in_data_len);

        /* Encrypt in place: key = first 16 bytes of block, IV = next 16. */
        aes_128cbc_encrypt(data, in_data_len * 64, data, &out_data_len, block, 16, block + 16);

        /* The sum of the first 16 encrypted bytes, modulo 3, selects the
         * hash used for this round: 0 -> SHA-256, 1 -> SHA-384, 2 -> SHA-512. */
        for (j = 0, sum = 0; j < 16; j++)
            sum += data[j];

        block_size = 32 + (sum % 3) * 16;
        switch (block_size) {
            case 32:
                cl_sha256(data, in_data_len * 64, sha2_256, NULL);
                memcpy(block, sha2_256, 32);
                break;
            case 48:
                cl_sha384(data, in_data_len * 64, sha2_384, NULL);
                memcpy(block, sha2_384, 48);
                break;
            case 64:
                cl_sha512(data, in_data_len * 64, sha2_512, NULL);
                memcpy(block, sha2_512, 64);
                break;
            default:
                /* Not reachable: block_size is always 32, 48 or 64. */
                break;
        }
    }

    /* The result is the first 32 bytes of the final block. */
    memcpy(hash, block, 32);
}
2024-01-14 19:22:00 -05:00
/**
* @ brief Check if the owner password matches an empty password .
*
* Will set the DECRYPTABLE_PDF flag if the owner password is empty .
* Will also set the key and keylen fields in the pdf_struct .
*
* Some details and comments for how to check the owner password comes from the PyPDF project :
* https : //github.com/py-pdf/pypdf/blob/3.17.4/pypdf/_encryption.py#L397
*
* @ param pdf The PDF context .
* @ param R The encryption version .
* @ param O The / O string .
* @ param U The / U string .
* @ param OE The / OE string .
* @ param OE_len The length of the / OE string .
*/
static void check_owner_password ( struct pdf_struct * pdf , int R ,
const char * O , const char * U ,
const char * OE , size_t OE_len )
2011-05-07 18:06:06 +03:00
{
2024-01-14 19:22:00 -05:00
bool password_empty = false ;
2014-07-10 18:11:49 -04:00
2011-05-07 18:06:06 +03:00
dbg_printhex ( " U: " , U , 32 ) ;
dbg_printhex ( " O: " , O , 32 ) ;
2023-11-03 15:00:46 -04:00
2024-01-14 19:22:00 -05:00
switch ( R ) {
case 6 : {
unsigned char hash [ 32 ] , validationkey [ 32 ] ;
2014-04-07 16:39:54 -04:00
2024-01-14 19:22:00 -05:00
size_t pwlen = 0 ;
char password [ ] = " " ;
2020-07-30 22:52:27 -07:00
2024-01-14 19:22:00 -05:00
if ( NULL = = OE ) {
cli_dbgmsg ( " check_owner_password: Missing OE value! \n " ) ;
noisy_warnmsg ( " check_owner_password: Missing OE value! \n " ) ;
goto done ;
2014-04-07 16:39:54 -04:00
}
2016-12-17 12:47:54 +01:00
2024-01-14 19:22:00 -05:00
dbg_printhex ( " OE: " , OE , OE_len ) ;
2020-07-30 22:52:27 -07:00
2024-01-14 19:22:00 -05:00
/*
* Test the password against the owner key by computing the SHA - 256 hash of the UTF - 8 password concatenated
* with the 8 bytes of owner validation salt , concatenated with the 48 - byte U string .
*/
compute_hash_r6 (
password ,
pwlen ,
( const unsigned char * ) ( O + 32 ) , // owner validation salt
validationkey ,
U ) ;
/* If the 32-byte result matches the first 32 bytes of the O string, this is the owner password. */
if ( 0 ! = memcmp ( O , validationkey , sizeof ( validationkey ) ) ) {
cli_dbgmsg ( " check_owner_password: Owner password check did not match! \n " ) ;
break ;
}
2023-11-03 15:00:46 -04:00
2024-01-14 19:22:00 -05:00
/*
* Compute an intermediate owner key by computing the SHA - 256 hash of the UTF - 8 password concatenated with
* the 8 bytes of owner key salt , concatenated with the 48 - byte U string .
*/
compute_hash_r6 (
password ,
pwlen ,
( const unsigned char * ) ( O + 40 ) , // owner key salt
hash ,
U ) ;
if ( OE_len ! = 32 ) {
cli_dbgmsg ( " check_owner_password: OE length is not 32: %zu \n " , OE_len ) ;
noisy_warnmsg ( " check_owner_password: OE length is not 32: %zu \n " , OE_len ) ;
2021-04-06 10:05:10 -07:00
} else {
pdf - > keylen = 32 ;
2022-05-09 14:28:34 -07:00
pdf - > key = cli_max_malloc ( pdf - > keylen ) ;
2021-04-06 10:05:10 -07:00
if ( ! pdf - > key ) {
2024-01-14 19:22:00 -05:00
cli_errmsg ( " check_owner_password: Cannot allocate memory for pdf->key \n " ) ;
goto done ;
2021-04-06 10:05:10 -07:00
}
2016-12-17 12:47:54 +01:00
2024-01-14 19:22:00 -05:00
aes_256cbc_decrypt ( ( const unsigned char * ) OE , & OE_len , ( unsigned char * ) ( pdf - > key ) , ( char * ) hash , 32 , 0 ) ;
dbg_printhex ( " check_owner_password: Candidate encryption key " , pdf - > key , pdf - > keylen ) ;
2019-08-23 12:57:18 -04:00
2024-01-14 19:22:00 -05:00
password_empty = true ;
2021-04-06 10:05:10 -07:00
}
2024-01-14 19:22:00 -05:00
break ;
2016-12-17 12:47:54 +01:00
}
2024-01-14 19:22:00 -05:00
default : {
cli_dbgmsg ( " check_owner_password: Unknown or unsupported encryption version. R: %d \n " , R ) ;
noisy_warnmsg ( " check_owner_password: Unknown or unsupported encryption version. R: %d \n " , R ) ;
}
}
2014-02-08 00:31:12 -05:00
2024-01-14 19:22:00 -05:00
if ( password_empty ) {
/* The key we computed above is the key used to encrypt the streams. We could decrypt it now if we wanted to */
pdf - > flags | = 1 < < DECRYPTABLE_PDF ;
2014-02-08 00:31:12 -05:00
2024-01-14 19:22:00 -05:00
cli_dbgmsg ( " check_owner_password: encrypted PDF found, owner password is empty, will attempt to decrypt \n " ) ;
noisy_msg ( pdf , " check_owner_password: encrypted PDF found, owner password is empty, will attempt to decrypt \n " ) ;
} else {
/* The key is not valid, we would need the user or the owner password to decrypt */
cli_dbgmsg ( " check_owner_password: encrypted PDF found but cannot decrypt with empty owner password \n " ) ;
noisy_warnmsg ( " check_owner_password: encrypted PDF found but cannot decrypt with empty owner password \n " ) ;
}
2014-02-08 00:31:12 -05:00
2024-01-14 19:22:00 -05:00
done :
2014-04-07 16:39:54 -04:00
2024-01-14 19:22:00 -05:00
return ;
}
2014-04-07 16:39:54 -04:00
2024-01-14 19:22:00 -05:00
static void check_user_password ( struct pdf_struct * pdf , int R , const char * O ,
const char * U , int32_t P , int EM ,
const char * UE , size_t UE_len ,
unsigned length )
{
unsigned i ;
uint8_t result [ 16 ] ;
char data [ 32 ] ;
struct arc4_state arc4 ;
bool password_empty = false ;
2014-02-08 00:31:12 -05:00
2024-01-14 19:22:00 -05:00
dbg_printhex ( " U: " , U , 32 ) ;
dbg_printhex ( " O: " , O , 32 ) ;
2014-04-07 16:39:54 -04:00
2024-01-14 19:22:00 -05:00
switch ( R ) {
case 2 :
case 3 :
case 4 : {
2014-04-07 16:39:54 -04:00
unsigned char * d ;
2024-01-09 19:41:17 -05:00
size_t sz = 68 + pdf - > fileIDlen + ( R > = 4 & & ! EM ? 4 : 0 ) ;
d = calloc ( 1 , sz ) ;
2014-04-07 16:39:54 -04:00
if ( ! ( d ) )
2024-01-14 19:22:00 -05:00
goto done ;
2014-04-07 16:39:54 -04:00
memcpy ( d , key_padding , 32 ) ;
2024-01-14 19:22:00 -05:00
memcpy ( d + 32 , O , 32 ) ;
P = le32_to_host ( P ) ;
memcpy ( d + 64 , & P , 4 ) ;
memcpy ( d + 68 , pdf - > fileID , pdf - > fileIDlen ) ;
/* 7.6.3.3 Algorithm 2 */
/* empty password, password == padding */
if ( R > = 4 & & ! EM ) {
uint32_t v = 0xFFFFFFFF ;
memcpy ( d + 68 + pdf - > fileIDlen , & v , 4 ) ;
}
2014-04-07 16:39:54 -04:00
2024-01-14 19:22:00 -05:00
cl_hash_data ( " md5 " , d , sz , result , NULL ) ;
free ( d ) ;
if ( length > 128 )
length = 128 ;
if ( R > = 3 ) {
/* Yes, this really is on purpose */
for ( i = 0 ; i < 50 ; i + + )
cl_hash_data ( " md5 " , result , length / 8 , result , NULL ) ;
2020-05-12 17:25:00 -07:00
}
2024-01-14 19:22:00 -05:00
if ( R = = 2 )
length = 40 ;
pdf - > keylen = length / 8 ;
2022-05-09 14:28:34 -07:00
pdf - > key = cli_max_malloc ( pdf - > keylen ) ;
2024-01-14 19:22:00 -05:00
if ( ! pdf - > key )
goto done ;
memcpy ( pdf - > key , result , pdf - > keylen ) ;
dbg_printhex ( " md5 " , ( const char * ) result , 16 ) ;
dbg_printhex ( " Candidate encryption key " , pdf - > key , pdf - > keylen ) ;
2014-04-07 16:39:54 -04:00
2024-01-14 19:22:00 -05:00
/* 7.6.3.3 Algorithm 6 */
if ( R = = 2 ) {
/* 7.6.3.3 Algorithm 4 */
memcpy ( data , key_padding , 32 ) ;
if ( false = = arc4_init ( & arc4 , ( const uint8_t * ) ( pdf - > key ) , pdf - > keylen ) ) {
noisy_warnmsg ( " check_user_password: failed to init arc4 \n " ) ;
goto done ;
}
arc4_apply ( & arc4 , ( uint8_t * ) data , 32 ) ;
dbg_printhex ( " computed U (R2) " , data , 32 ) ;
if ( ! memcmp ( data , U , 32 ) )
password_empty = true ;
} else {
// R is 3 or 4
unsigned len = pdf - > keylen ;
unsigned char * d ;
d = calloc ( 1 , 32 + pdf - > fileIDlen ) ;
if ( ! ( d ) )
goto done ;
/* 7.6.3.3 Algorithm 5 */
memcpy ( d , key_padding , 32 ) ;
memcpy ( d + 32 , pdf - > fileID , pdf - > fileIDlen ) ;
cl_hash_data ( " md5 " , d , 32 + pdf - > fileIDlen , result , NULL ) ;
memcpy ( data , pdf - > key , len ) ;
2014-04-07 16:39:54 -04:00
2020-05-12 17:25:00 -07:00
if ( false = = arc4_init ( & arc4 , ( const uint8_t * ) data , len ) ) {
2020-07-30 22:52:27 -07:00
noisy_warnmsg ( " check_user_password: failed to init arc4 \n " ) ;
2024-01-14 19:22:00 -05:00
goto done ;
2020-05-12 17:25:00 -07:00
}
2014-04-07 16:39:54 -04:00
arc4_apply ( & arc4 , result , 16 ) ;
2024-01-14 19:22:00 -05:00
for ( i = 1 ; i < = 19 ; i + + ) {
unsigned j ;
for ( j = 0 ; j < len ; j + + )
data [ j ] = pdf - > key [ j ] ^ i ;
if ( false = = arc4_init ( & arc4 , ( const uint8_t * ) data , len ) ) {
noisy_warnmsg ( " check_user_password: failed to init arc4 \n " ) ;
goto done ;
}
arc4_apply ( & arc4 , result , 16 ) ;
}
dbg_printhex ( " fileID " , pdf - > fileID , pdf - > fileIDlen ) ;
dbg_printhex ( " computed U (R>=3) " , ( const char * ) result , 16 ) ;
if ( ! memcmp ( result , U , 16 ) )
password_empty = true ;
free ( d ) ;
2014-04-07 16:39:54 -04:00
}
2024-01-14 19:22:00 -05:00
break ;
2014-04-07 16:39:54 -04:00
}
2024-01-14 19:22:00 -05:00
case 5 : {
uint8_t result2 [ 32 ] ;
2014-04-07 16:39:54 -04:00
2024-01-14 19:22:00 -05:00
/* supplement to ISO3200, 3.5.2 Algorithm 3.11 */
/* user validation salt */
cl_sha256 ( U + 32 , 8 , result2 , NULL ) ;
dbg_printhex ( " Computed U " , ( const char * ) result2 , 32 ) ;
if ( ! memcmp ( result2 , U , 32 ) ) {
/* Algorithm 3.2a could be used to recover encryption key */
cl_sha256 ( U + 40 , 8 , result2 , NULL ) ;
if ( UE_len ! = 32 ) {
cli_dbgmsg ( " check_user_password: UE length is not 32: %zu \n " , UE_len ) ;
noisy_warnmsg ( " check_user_password: UE length is not 32: %zu \n " , UE_len ) ;
} else {
pdf - > keylen = 32 ;
2022-05-09 14:28:34 -07:00
pdf - > key = cli_max_malloc ( pdf - > keylen ) ;
2024-01-14 19:22:00 -05:00
if ( ! pdf - > key ) {
cli_errmsg ( " check_user_password: Cannot allocate memory for pdf->key \n " ) ;
goto done ;
}
aes_256cbc_decrypt ( ( const unsigned char * ) UE , & UE_len , ( unsigned char * ) ( pdf - > key ) , ( char * ) result2 , 32 , 0 ) ;
dbg_printhex ( " check_user_password: Candidate encryption key " , pdf - > key , pdf - > keylen ) ;
password_empty = true ;
}
}
break ;
}
case 6 : {
unsigned char hash [ 32 ] , validationkey [ 32 ] ;
size_t pwlen = 0 ;
char password [ ] = " " ;
if ( NULL = = UE ) {
cli_dbgmsg ( " check_user_password: Missing UE value! \n " ) ;
noisy_warnmsg ( " check_user_password: Missing UE value! \n " ) ;
goto done ;
}
dbg_printhex ( " UE: " , UE , UE_len ) ;
/*
* Test the password against the user key by computing the SHA - 256 hash of the UTF - 8 password concatenated
* with the 8 bytes of user validation salt .
*/
compute_hash_r6 (
password ,
pwlen ,
( const unsigned char * ) ( U + 32 ) , // user validation salt
validationkey ,
NULL ) ; // no U string for user password check
/* If the 32-byte result matches the first 32 bytes of the U string, this is the user password. */
if ( 0 ! = memcmp ( U , validationkey , sizeof ( validationkey ) ) ) {
cli_dbgmsg ( " check_user_password: User password check did not match! \n " ) ;
break ;
}
/*
* Compute an intermediate user key by computing the SHA - 256 hash of the UTF - 8 password concatenated with
* the 8 bytes of user key salt .
*/
compute_hash_r6 (
password ,
pwlen ,
( const unsigned char * ) ( U + 40 ) , // user key salt
hash ,
NULL ) ; // no U string for user password check
if ( UE_len ! = 32 ) {
cli_dbgmsg ( " check_user_password: UE length is not 32: %zu \n " , UE_len ) ;
noisy_warnmsg ( " check_user_password: UE length is not 32: %zu \n " , UE_len ) ;
} else {
pdf - > keylen = 32 ;
2022-05-09 14:28:34 -07:00
pdf - > key = cli_max_malloc ( pdf - > keylen ) ;
2024-01-14 19:22:00 -05:00
if ( ! pdf - > key ) {
cli_errmsg ( " check_user_password: Cannot allocate memory for pdf->key \n " ) ;
goto done ;
}
aes_256cbc_decrypt ( ( const unsigned char * ) UE , & UE_len , ( unsigned char * ) ( pdf - > key ) , ( char * ) hash , 32 , 0 ) ;
dbg_printhex ( " check_user_password: Candidate encryption key " , pdf - > key , pdf - > keylen ) ;
password_empty = true ;
}
break ;
}
default : {
/* Supported R is in {2,3,4,5} */
cli_dbgmsg ( " check_user_password: R value out of range \n " ) ;
noisy_warnmsg ( " check_user_password: R value out of range \n " ) ;
}
2013-03-12 10:45:44 -04:00
}
2014-04-07 16:39:54 -04:00
2011-05-07 18:06:06 +03:00
if ( password_empty ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " check_user_password: user password is empty \n " ) ;
noisy_msg ( pdf , " check_user_password: encrypted PDF found, user password is empty, will attempt to decrypt \n " ) ;
2014-04-07 16:39:54 -04:00
/* The key we computed above is the key used to encrypt the streams.
* We could decrypt it now if we wanted to */
pdf - > flags | = 1 < < DECRYPTABLE_PDF ;
2011-05-07 18:06:06 +03:00
} else {
2014-04-07 16:39:54 -04:00
/* the key is not valid, we would need the user or the owner password to decrypt */
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " check_user_password: user/owner password would be required for decryption \n " ) ;
noisy_warnmsg ( " check_user_password: encrypted PDF found, user password is NOT empty, cannot decrypt! \n " ) ;
2011-05-07 18:06:06 +03:00
}
2024-01-14 19:22:00 -05:00
done :
return ;
2011-05-07 18:06:06 +03:00
}
2016-03-31 12:29:16 -04:00
enum enc_method parse_enc_method ( const char * dict , unsigned len , const char * key , enum enc_method def )
2011-12-15 17:24:36 +02:00
{
const char * q ;
2018-12-03 12:40:13 -05:00
char * CFM = NULL ;
2013-08-07 13:50:08 -04:00
enum enc_method ret = ENC_UNKNOWN ;
2014-04-07 16:39:54 -04:00
2011-12-15 17:24:36 +02:00
if ( ! key )
2014-04-07 16:39:54 -04:00
return def ;
2011-12-15 17:24:36 +02:00
if ( ! strcmp ( key , " Identity " ) )
2014-04-07 16:39:54 -04:00
return ENC_IDENTITY ;
2014-07-10 18:11:49 -04:00
q = pdf_getdict ( dict , ( int * ) ( & len ) , key ) ;
2011-12-15 17:24:36 +02:00
if ( ! q )
2014-04-07 16:39:54 -04:00
return def ;
2011-12-15 17:24:36 +02:00
CFM = pdf_readval ( q , len , " /CFM " ) ;
if ( CFM ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " parse_enc_method: %s CFM: %s \n " , key , CFM ) ;
2018-12-03 12:40:13 -05:00
if ( ! strncmp ( CFM , " V2 " , 2 ) )
2014-04-07 16:39:54 -04:00
ret = ENC_V2 ;
2018-12-03 12:40:13 -05:00
else if ( ! strncmp ( CFM , " AESV2 " , 5 ) )
2014-04-07 16:39:54 -04:00
ret = ENC_AESV2 ;
2018-12-03 12:40:13 -05:00
else if ( ! strncmp ( CFM , " AESV3 " , 5 ) )
2014-04-07 16:39:54 -04:00
ret = ENC_AESV3 ;
2018-12-03 12:40:13 -05:00
else if ( ! strncmp ( CFM , " None " , 4 ) )
2014-04-07 16:39:54 -04:00
ret = ENC_NONE ;
free ( CFM ) ;
2011-12-15 17:24:36 +02:00
}
2014-04-07 16:39:54 -04:00
2013-08-07 13:50:08 -04:00
return ret ;
2011-12-15 17:24:36 +02:00
}
2015-03-20 15:10:52 -04:00
void pdf_handle_enc ( struct pdf_struct * pdf )
2011-05-07 18:06:06 +03:00
{
struct pdf_obj * obj ;
2014-07-10 18:11:49 -04:00
uint32_t len , n , R , P , length , EM = 1 , i , oulen ;
2024-01-14 19:22:00 -05:00
char * O = NULL ;
char * OE = NULL ;
size_t OE_len = 0 ;
char * U = NULL ;
char * UE = NULL ;
2023-11-03 15:00:46 -04:00
size_t UE_len = 0 ;
2024-01-14 19:22:00 -05:00
char * StmF = NULL ;
char * StrF = NULL ;
char * EFF = NULL ;
2011-05-07 18:06:06 +03:00
const char * q , * q2 ;
2011-10-08 13:36:12 +03:00
if ( pdf - > enc_objid = = ~ 0u )
2014-04-07 16:39:54 -04:00
return ;
2011-10-08 13:36:12 +03:00
if ( ! pdf - > fileID ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " pdf_handle_enc: no file ID \n " ) ;
noisy_warnmsg ( " pdf_handle_enc: no file ID \n " ) ;
2014-04-07 16:39:54 -04:00
return ;
2011-10-08 13:36:12 +03:00
}
2014-04-07 16:39:54 -04:00
2018-08-14 14:00:31 -07:00
obj = find_obj ( pdf , pdf - > objs [ 0 ] , pdf - > enc_objid ) ;
2011-10-08 13:36:12 +03:00
if ( ! obj ) {
2024-07-16 11:22:05 -04:00
cli_dbgmsg ( " pdf_handle_enc: can't find encryption object %d %d \n " , pdf - > enc_objid > > 8 , pdf - > enc_objid & 0xff ) ;
noisy_warnmsg ( " pdf_handle_enc: can't find encryption object %d %d \n " , pdf - > enc_objid > > 8 , pdf - > enc_objid & 0xff ) ;
2014-04-07 16:39:54 -04:00
return ;
2011-10-08 13:36:12 +03:00
}
2014-04-07 16:39:54 -04:00
2019-03-05 21:15:41 -05:00
len = obj - > size ;
2022-01-06 16:53:44 -08:00
2024-07-16 11:22:05 -04:00
if ( NULL = = obj - > objstm ) {
q = ( const char * ) ( obj - > start + pdf - > map ) ;
if ( ! CLI_ISCONTAINED ( pdf - > map , pdf - > size , q , len ) ) {
cli_dbgmsg ( " pdf_handle_enc: encryption object found, but not contained in PDF \n " ) ;
noisy_warnmsg ( " pdf_handle_enc: encryption object found, but not contained in PDF \n " ) ;
return ;
}
} else {
q = ( const char * ) ( obj - > start + obj - > objstm - > streambuf ) ;
if ( ! CLI_ISCONTAINED ( obj - > objstm - > streambuf , obj - > objstm - > streambuf_len , q , len ) ) {
cli_dbgmsg ( " pdf_handle_enc: encryption object found, but not contained in PDF streambuf \n " ) ;
noisy_warnmsg ( " pdf_handle_enc: encryption object found, but not contained in PDF streambuf \n " ) ;
return ;
}
}
2011-05-07 18:06:06 +03:00
2011-12-15 17:24:36 +02:00
O = U = UE = StmF = StrF = EFF = NULL ;
2011-12-14 15:43:14 +02:00
2024-01-14 19:22:00 -05:00
pdf - > enc_method_string = ENC_UNKNOWN ;
pdf - > enc_method_stream = ENC_UNKNOWN ;
pdf - > enc_method_embeddedfile = ENC_UNKNOWN ;
2011-05-07 18:06:06 +03:00
2024-01-14 19:22:00 -05:00
q2 = cli_memstr ( q , len , " /Standard " , 9 ) ;
if ( ! q2 ) {
cli_dbgmsg ( " pdf_handle_enc: /Standard not found \n " ) ;
noisy_warnmsg ( " pdf_handle_enc: /Standard not found \n " ) ;
goto done ;
}
2011-05-07 18:06:06 +03:00
2024-01-14 19:22:00 -05:00
/* we can have both of these:
* / AESV2 / Length / Standard / Length
* / Length / Standard
* make sure we don ' t mistake AES ' s length for Standard ' s */
length = pdf_readint ( q2 , len - ( q2 - q ) , " /Length " ) ;
if ( length = = ~ 0u )
length = pdf_readint ( q , len , " /Length " ) ;
2014-04-07 16:39:54 -04:00
2024-01-14 19:22:00 -05:00
if ( length < 40 ) {
cli_dbgmsg ( " pdf_handle_enc: invalid length: %d \n " , length ) ;
length = 40 ;
}
2011-05-07 18:06:06 +03:00
2024-01-14 19:22:00 -05:00
R = pdf_readint ( q , len , " /R " ) ;
if ( R = = ~ 0u ) {
cli_dbgmsg ( " pdf_handle_enc: invalid R \n " ) ;
noisy_warnmsg ( " pdf_handle_enc: invalid R \n " ) ;
goto done ;
}
2011-05-07 18:06:06 +03:00
2024-01-14 19:22:00 -05:00
if ( ( R > 6 ) | | ( R < 2 ) ) {
cli_dbgmsg ( " pdf_handle_enc: R value outside supported range [2..6] \n " ) ;
noisy_warnmsg ( " pdf_handle_enc: R value outside supported range [2..6] \n " ) ;
goto done ;
}
2014-04-07 16:39:54 -04:00
2024-01-14 19:22:00 -05:00
P = pdf_readint ( q , len , " /P " ) ;
if ( R < 6 ) { // P field doesn't seem to be required for R6.
if ( P = = ~ 0u ) {
cli_dbgmsg ( " pdf_handle_enc: invalid P \n " ) ;
noisy_warnmsg ( " pdf_handle_enc: invalid P \n " ) ;
goto done ;
2019-08-23 12:57:18 -04:00
}
2024-01-14 19:22:00 -05:00
}
2019-08-23 12:57:18 -04:00
2024-01-14 19:22:00 -05:00
if ( R < 5 ) {
oulen = 32 ;
} else {
oulen = 48 ;
}
if ( R = = 2 | | R = = 3 ) {
pdf - > enc_method_stream = ENC_V2 ;
pdf - > enc_method_string = ENC_V2 ;
pdf - > enc_method_embeddedfile = ENC_V2 ;
} else if ( R = = 4 | | R = = 5 | | R = = 6 ) {
EM = pdf_readbool ( q , len , " /EncryptMetadata " , 1 ) ;
StmF = pdf_readval ( q , len , " /StmF " ) ;
StrF = pdf_readval ( q , len , " /StrF " ) ;
EFF = pdf_readval ( q , len , " /EFF " ) ;
n = len ;
pdf - > CF = pdf_getdict ( q , ( int * ) ( & n ) , " /CF " ) ;
pdf - > CF_n = n ;
if ( StmF ) {
cli_dbgmsg ( " pdf_handle_enc: StmF: %s \n " , StmF ) ;
}
if ( StrF ) {
cli_dbgmsg ( " pdf_handle_enc: StrF: %s \n " , StrF ) ;
}
if ( EFF ) {
cli_dbgmsg ( " pdf_handle_enc: EFF: %s \n " , EFF ) ;
2014-04-07 16:39:54 -04:00
}
2024-01-14 19:22:00 -05:00
pdf - > enc_method_stream = parse_enc_method ( pdf - > CF , n , StmF , ENC_IDENTITY ) ;
pdf - > enc_method_string = parse_enc_method ( pdf - > CF , n , StrF , ENC_IDENTITY ) ;
pdf - > enc_method_embeddedfile = parse_enc_method ( pdf - > CF , n , EFF , pdf - > enc_method_stream ) ;
2014-04-07 16:39:54 -04:00
2024-01-14 19:22:00 -05:00
cli_dbgmsg ( " pdf_handle_enc: EncryptMetadata: %s \n " , EM ? " true " : " false " ) ;
2014-04-07 16:39:54 -04:00
2024-01-14 19:22:00 -05:00
if ( R = = 4 ) {
length = 128 ;
} else {
length = 256 ;
/*
* Read the UE value ( for checking user - password )
*/
n = 0 ;
UE = pdf_readstring ( q , len , " /UE " , & n , NULL , false ) ;
UE_len = n ;
/*
* Read the OE value ( for checking owner - password )
*/
n = 0 ;
OE = pdf_readstring ( q , len , " /OE " , & n , NULL , false ) ;
OE_len = n ;
2014-04-07 16:39:54 -04:00
}
2024-01-14 19:22:00 -05:00
}
if ( length = = ~ 0u )
length = 40 ;
2014-04-07 16:39:54 -04:00
2024-01-14 19:22:00 -05:00
/*
* Read the O value
*/
n = 0 ;
O = pdf_readstring ( q , len , " /O " , & n , NULL , false ) ;
if ( ! O | | n < oulen ) {
cli_dbgmsg ( " pdf_handle_enc: invalid O: %d \n " , n ) ;
noisy_warnmsg ( " pdf_handle_enc: invalid O: %d \n " , n ) ;
if ( O ) {
dbg_printhex ( " invalid O " , O , n ) ;
}
goto done ;
}
if ( n > oulen ) {
for ( i = oulen ; i < n ; i + + ) {
if ( O [ i ] ) {
2018-08-14 14:00:31 -07:00
dbg_printhex ( " pdf_handle_enc: too long O " , O , n ) ;
noisy_warnmsg ( " pdf_handle_enc: too long O: %u " , n ) ;
2024-01-14 19:22:00 -05:00
goto done ;
2014-04-07 16:39:54 -04:00
}
}
2024-01-14 19:22:00 -05:00
}
2014-04-07 16:39:54 -04:00
2024-01-14 19:22:00 -05:00
/*
* Read the U value
*/
n = 0 ;
U = pdf_readstring ( q , len , " /U " , & n , NULL , false ) ;
if ( ! U | | n < oulen ) {
cli_dbgmsg ( " pdf_handle_enc: invalid U: %u \n " , n ) ;
noisy_warnmsg ( " pdf_handle_enc: invalid U: %u \n " , n ) ;
if ( U ) {
dbg_printhex ( " invalid U " , U , n ) ;
2014-04-07 16:39:54 -04:00
}
2024-01-14 19:22:00 -05:00
goto done ;
}
if ( n > oulen ) {
for ( i = oulen ; i < n ; i + + ) {
if ( U [ i ] ) {
2014-04-07 16:39:54 -04:00
dbg_printhex ( " too long U " , U , n ) ;
2024-01-14 19:22:00 -05:00
goto done ;
2014-04-07 16:39:54 -04:00
}
}
2024-01-14 19:22:00 -05:00
}
2014-04-07 16:39:54 -04:00
2024-01-14 19:22:00 -05:00
cli_dbgmsg ( " pdf_handle_enc: Encrypt R: %d, P %x, length: %u \n " , R , P , length ) ;
if ( length % 8 ) {
cli_dbgmsg ( " pdf_handle_enc: wrong key length, not multiple of 8 \n " ) ;
noisy_warnmsg ( " pdf_handle_enc: wrong key length, not multiple of 8 \n " ) ;
goto done ;
}
// Check the owner password.
check_owner_password ( pdf , R , O , U , OE , OE_len ) ;
if ( NULL = = pdf - > key ) {
// Wasn't the owner password, let's try the user password.
check_user_password ( pdf , R , O , U , P , EM , UE , UE_len , length ) ;
}
2014-04-07 16:39:54 -04:00
2024-01-14 19:22:00 -05:00
done :
2011-05-07 18:06:06 +03:00
free ( O ) ;
2024-01-14 19:22:00 -05:00
free ( OE ) ;
2011-05-07 18:06:06 +03:00
free ( U ) ;
2011-12-15 13:27:31 +02:00
free ( UE ) ;
2024-01-14 19:22:00 -05:00
free ( StmF ) ;
free ( StrF ) ;
free ( EFF ) ;
2011-05-07 18:06:06 +03:00
}
2018-08-14 14:00:31 -07:00
/**
2019-03-05 21:15:41 -05:00
* @ brief Search pdf buffer for objects . Parse each .
*
2018-08-14 14:00:31 -07:00
* Newly found objects will be extracted after completion when the extraction for loop continues .
2019-03-05 21:15:41 -05:00
*
* @ param pdf Pdf struct that keeps track of all information found in the PDF .
2018-08-14 14:00:31 -07:00
* @ param objstm Pointer to an object stream to parse .
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* @ return cl_error_t Error code .
*/
cl_error_t pdf_find_and_parse_objs_in_objstm ( struct pdf_struct * pdf , struct objstm_struct * objstm )
{
2019-03-05 21:15:41 -05:00
cl_error_t status = CL_EFORMAT ;
cl_error_t retval = CL_EPARSE ;
2018-08-14 14:00:31 -07:00
uint32_t badobjects = 0 ;
2018-12-03 12:40:13 -05:00
size_t i = 0 ;
2018-08-14 14:00:31 -07:00
2018-12-03 12:40:13 -05:00
struct pdf_obj * obj = NULL ;
2018-08-14 14:00:31 -07:00
2019-01-22 14:15:46 -05:00
if ( ( NULL = = objstm ) | | ( NULL = = objstm - > streambuf ) ) {
status = CL_EARG ;
goto done ;
}
2018-12-03 12:40:13 -05:00
if ( ( 0 = = objstm - > first ) | |
( 0 = = objstm - > streambuf_len ) | |
( 0 = = objstm - > n ) ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " pdf_find_and_parse_objs_in_objstm: Empty object stream. \n " ) ;
goto done ;
}
2018-12-03 12:40:13 -05:00
if ( objstm - > first > = objstm - > streambuf_len ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " pdf_find_and_parse_objs_in_objstm: Invalid objstm values. Offset of first obj greater than stream length. \n " ) ;
goto done ;
}
/* Process each object */
2018-12-03 12:40:13 -05:00
for ( i = 0 ; i < objstm - > n ; i + + ) {
2018-08-14 14:00:31 -07:00
obj = NULL ;
if ( cli_checktimelimit ( pdf - > ctx ) ! = CL_SUCCESS ) {
2022-09-17 11:30:32 -07:00
cli_dbgmsg ( " Timeout reached in the PDF parser while parsing object stream. \n " ) ;
2018-08-14 14:00:31 -07:00
status = CL_ETIMEOUT ;
goto done ;
}
/* Find object */
retval = pdf_findobj_in_objstm ( pdf , objstm , & obj ) ;
2018-12-03 12:40:13 -05:00
if ( retval ! = CL_SUCCESS ) {
2019-07-15 15:10:24 -07:00
if ( retval ! = CL_BREAK ) {
2025-06-30 10:47:02 -04:00
cli_dbgmsg ( " pdf_find_and_parse_objs_in_objstm: Fewer objects in stream than expected: %zu found, %zu expected. \n " ,
2020-01-03 15:53:29 -05:00
objstm - > nobjs_found , objstm - > n ) ;
2019-07-15 15:10:24 -07:00
badobjects + + ;
pdf - > stats . ninvalidobjs + + ;
}
2018-08-14 14:00:31 -07:00
break ;
}
cli_dbgmsg ( " pdf_find_and_parse_objs_in_objstm: Found object %u %u in object stream at offset: %u \n " , obj - > id > > 8 , obj - > id & 0xff , obj - > start ) ;
if ( cli_checktimelimit ( pdf - > ctx ) ! = CL_SUCCESS ) {
2022-09-17 11:30:32 -07:00
cli_dbgmsg ( " Timeout reached in the PDF parser while parsing object stream. \n " ) ;
2018-08-14 14:00:31 -07:00
status = CL_ETIMEOUT ;
goto done ;
}
/* Parse object */
pdf_parseobj ( pdf , obj ) ;
}
2022-08-16 18:55:06 -07:00
if ( badobjects ) {
2018-08-14 14:00:31 -07:00
status = CL_EFORMAT ;
goto done ;
}
2018-12-03 12:40:13 -05:00
2018-08-14 14:00:31 -07:00
status = CL_SUCCESS ;
done :
return status ;
}
/**
* @ brief Search pdf buffer for objects . Parse each and then extract each .
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* @ param pdf Pdf struct that keeps track of all information found in the PDF .
2019-03-05 21:15:41 -05:00
*
2021-07-14 15:33:52 -07:00
* @ return cl_error_t Error code .
2018-08-14 14:00:31 -07:00
*/
2022-08-16 18:55:06 -07:00
static cl_error_t pdf_find_and_extract_objs ( struct pdf_struct * pdf )
2018-08-14 14:00:31 -07:00
{
2019-03-05 21:15:41 -05:00
cl_error_t status = CL_SUCCESS ;
int32_t rv = 0 ;
unsigned int i = 0 ;
2018-08-14 14:00:31 -07:00
uint32_t badobjects = 0 ;
2019-05-05 14:58:22 -04:00
cli_ctx * ctx = NULL ;
2018-08-14 14:00:31 -07:00
2022-08-16 18:55:06 -07:00
if ( NULL = = pdf ) {
2018-11-14 16:58:30 -05:00
cli_errmsg ( " pdf_find_and_extract_objs: Invalid arguments. \n " ) ;
status = CL_EARG ;
goto done ;
}
2019-05-05 14:58:22 -04:00
ctx = pdf - > ctx ;
2018-08-14 14:00:31 -07:00
/* parse PDF and find obj offsets */
while ( CL_BREAK ! = ( rv = pdf_findobj ( pdf ) ) ) {
if ( rv = = CL_EMEM ) {
2019-07-15 15:10:24 -07:00
cli_errmsg ( " pdf_find_and_extract_objs: Memory allocation error. \n " ) ;
status = CL_EMEM ;
goto done ;
2018-08-14 14:00:31 -07:00
}
}
/* must parse after finding all objs, so we can flag indirect objects */
2018-12-03 12:40:13 -05:00
for ( i = 0 ; i < pdf - > nobjs ; i + + ) {
2018-08-14 14:00:31 -07:00
struct pdf_obj * obj = pdf - > objs [ i ] ;
if ( cli_checktimelimit ( pdf - > ctx ) ! = CL_SUCCESS ) {
2022-09-17 11:30:32 -07:00
cli_dbgmsg ( " pdf_find_and_extract_objs: Timeout reached in the PDF parser while parsing objects. \n " ) ;
2018-08-14 14:00:31 -07:00
status = CL_ETIMEOUT ;
goto done ;
}
pdf_parseobj ( pdf , obj ) ;
}
pdf_handle_enc ( pdf ) ;
if ( pdf - > flags & ( 1 < < ENCRYPTED_PDF ) )
cli_dbgmsg ( " pdf_find_and_extract_objs: encrypted pdf found, %s! \n " ,
2018-12-03 12:40:13 -05:00
( pdf - > flags & ( 1 < < DECRYPTABLE_PDF ) ) ? " decryptable " : " not decryptable, stream will probably fail to decompress " ) ;
2018-08-14 14:00:31 -07:00
2018-10-10 06:02:28 -07:00
if ( SCAN_HEURISTIC_ENCRYPTED_DOC & &
2018-12-03 12:40:13 -05:00
( pdf - > flags & ( 1 < < ENCRYPTED_PDF ) ) & &
! ( pdf - > flags & ( 1 < < DECRYPTABLE_PDF ) ) ) {
2018-08-14 14:00:31 -07:00
/* It is encrypted, and a password/key needs to be supplied to decrypt.
* This doesn ' t trigger for PDFs that are encrypted but don ' t need
* a password to decrypt */
2022-08-19 16:21:42 -07:00
status = cli_append_potentially_unwanted ( pdf - > ctx , " Heuristics.Encrypted.PDF " ) ;
2018-08-14 14:00:31 -07:00
}
2019-05-05 14:58:22 -04:00
if ( CL_SUCCESS = = status ) {
2025-06-08 01:12:33 -04:00
status = run_pdf_hooks ( pdf , PDF_PHASE_PARSED , - 1 , NULL ) ;
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " pdf_find_and_extract_objs: (parsed hooks) returned %d \n " , status ) ;
}
2022-08-16 18:55:06 -07:00
if ( CL_SUCCESS = = status ) {
/* extract PDF objs */
for ( i = 0 ; ! status & & i < pdf - > nobjs ; i + + ) {
struct pdf_obj * obj = pdf - > objs [ i ] ;
2018-08-14 14:00:31 -07:00
2022-08-16 18:55:06 -07:00
if ( cli_checktimelimit ( pdf - > ctx ) ! = CL_SUCCESS ) {
2022-09-17 11:30:32 -07:00
cli_dbgmsg ( " pdf_find_and_extract_objs: Timeout reached in the PDF parser while extracting objects. \n " ) ;
2018-08-14 14:00:31 -07:00
2022-08-16 18:55:06 -07:00
status = CL_ETIMEOUT ;
goto done ;
}
2018-08-14 14:00:31 -07:00
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth + + ;
2022-08-16 18:55:06 -07:00
status = pdf_extract_obj ( pdf , obj , PDF_EXTRACT_OBJ_SCAN ) ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth - - ;
2022-08-16 18:55:06 -07:00
switch ( status ) {
case CL_EFORMAT :
/* Don't halt on one bad object */
cli_dbgmsg ( " pdf_find_and_extract_objs: Format error when extracting object, skipping to the next object. \n " ) ;
badobjects + + ;
pdf - > stats . ninvalidobjs + + ;
2018-08-14 14:00:31 -07:00
status = CL_CLEAN ;
2022-08-16 18:55:06 -07:00
break ;
case CL_VIRUS :
break ;
default :
break ;
}
2018-08-14 14:00:31 -07:00
}
}
done :
2019-05-05 14:58:22 -04:00
if ( ( CL_SUCCESS = = status ) & & badobjects ) {
2018-08-14 14:00:31 -07:00
status = CL_EFORMAT ;
}
return status ;
}
/**
* @ brief Primary function for parsing and scanning a PDF .
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* @ param dir Filepath for temp file .
2019-03-05 21:15:41 -05:00
* @ param ctx clam scan context structure .
2018-08-14 14:00:31 -07:00
* @ param offset offset of pdf in ctx - > fmap
2019-03-05 21:15:41 -05:00
*
2018-08-14 14:00:31 -07:00
* @ return int Returns cl_error_t status value .
*/
2020-03-19 21:23:54 -04:00
cl_error_t cli_pdf ( const char * dir , cli_ctx * ctx , off_t offset )
2010-05-10 11:57:44 +03:00
{
2018-08-14 14:00:31 -07:00
cl_error_t rc = CL_SUCCESS ;
2010-05-10 11:57:44 +03:00
struct pdf_struct pdf ;
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │ └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │ └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
| └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │ └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │ └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │ └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │ └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │ └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │ └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │ └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
| └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfx.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if inside of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │ └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │ └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
| └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the end of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set similar flag for the layer. The context flag is
reset so that anything after this doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost, which would result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurrence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the performance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't get multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded event will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
fmap_t * map = ctx - > fmap ;
2018-12-03 12:40:13 -05:00
size_t size = map - > len - offset ;
2010-05-10 11:57:44 +03:00
off_t versize = size > 1032 ? 1032 : size ;
off_t map_off , bytesleft ;
2018-06-12 20:47:21 -04:00
unsigned long xref ;
2019-01-22 14:15:46 -05:00
long temp_long ;
2018-03-08 12:17:11 -05:00
const char * pdfver , * tmp , * start , * eofmap , * q , * eof ;
2022-08-16 18:55:06 -07:00
unsigned i ;
2018-08-14 14:00:31 -07:00
unsigned int objs_found = 0 ;
2024-03-25 13:01:46 -04:00
2018-12-03 12:40:13 -05:00
json_object * pdfobj = NULL ;
2014-06-30 14:06:37 -04:00
char * begin , * end , * p1 ;
2010-05-10 11:57:44 +03:00
cli_dbgmsg ( " in cli_pdf(%s) \n " , dir ) ;
memset ( & pdf , 0 , sizeof ( pdf ) ) ;
2018-12-03 12:40:13 -05:00
pdf . ctx = ctx ;
pdf . dir = dir ;
2011-05-07 18:06:06 +03:00
pdf . enc_objid = ~ 0u ;
2010-05-10 11:57:44 +03:00
pdfver = start = fmap_need_off_once ( map , offset , versize ) ;
/* Check PDF version */
if ( ! pdfver ) {
2014-04-07 16:39:54 -04:00
cli_errmsg ( " cli_pdf: mmap() failed (1) \n " ) ;
2018-08-14 14:00:31 -07:00
rc = CL_EMAP ;
goto done ;
2010-05-10 11:57:44 +03:00
}
2014-04-07 16:39:54 -04:00
2025-06-08 01:12:33 -04:00
if ( ctx - > this_layer_metadata_json )
pdfobj = cli_jsonobj ( ctx - > this_layer_metadata_json , " PDFStats " ) ;
2014-06-25 13:36:30 -04:00
2010-05-10 11:57:44 +03:00
/* offset is 0 when coming from filetype2 */
2018-03-08 12:17:11 -05:00
tmp = cli_memstr ( pdfver , versize , " %PDF- " , 5 ) ;
if ( ! tmp ) {
2014-04-07 16:39:54 -04:00
cli_dbgmsg ( " cli_pdf: no PDF- header found \n " ) ;
noisy_warnmsg ( " cli_pdf: no PDF- header found \n " ) ;
2022-05-31 19:15:06 -04:00
2018-08-14 14:00:31 -07:00
rc = CL_SUCCESS ;
goto done ;
2010-05-10 11:57:44 +03:00
}
2014-04-07 16:39:54 -04:00
2018-03-08 12:17:11 -05:00
versize - = tmp - pdfver ;
pdfver = tmp ;
if ( versize < 8 ) {
2018-08-14 14:00:31 -07:00
rc = CL_EFORMAT ;
goto done ;
2018-03-08 12:17:11 -05:00
}
2014-04-07 16:39:54 -04:00
/* Check for PDF-1.[0-9]. Although 1.7 is highest now, allow for future versions */
2010-05-10 11:57:44 +03:00
if ( pdfver [ 5 ] ! = ' 1 ' | | pdfver [ 6 ] ! = ' . ' | |
2014-04-07 16:39:54 -04:00
pdfver [ 7 ] < ' 1 ' | | pdfver [ 7 ] > ' 9 ' ) {
pdf . flags | = 1 < < BAD_PDF_VERSION ;
cli_dbgmsg ( " cli_pdf: bad pdf version: %.8s \n " , pdfver ) ;
2024-03-25 13:01:46 -04:00
2014-06-25 13:36:30 -04:00
if ( pdfobj )
cli_jsonbool ( pdfobj , " BadVersion " , 1 ) ;
2014-06-30 14:06:37 -04:00
} else {
if ( pdfobj ) {
2018-12-03 12:40:13 -05:00
begin = ( char * ) ( pdfver + 5 ) ;
end = begin + 2 ;
2014-06-30 14:06:37 -04:00
strtoul ( end , & end , 10 ) ;
2022-05-09 14:28:34 -07:00
p1 = cli_max_calloc ( ( end - begin ) + 2 , 1 ) ;
2014-06-30 14:06:37 -04:00
if ( p1 ) {
strncpy ( p1 , begin , end - begin ) ;
p1 [ end - begin ] = ' \0 ' ;
cli_jsonstr ( pdfobj , " PDFVersion " , p1 ) ;
free ( p1 ) ;
}
}
2010-05-10 11:57:44 +03:00
}
2014-04-07 16:39:54 -04:00
2010-05-10 11:57:44 +03:00
if ( pdfver ! = start | | offset ) {
2014-04-07 16:39:54 -04:00
pdf . flags | = 1 < < BAD_PDF_HEADERPOS ;
2017-09-21 14:26:37 -04:00
cli_dbgmsg ( " cli_pdf: PDF header is not at position 0: %lld \n " , ( long long ) ( pdfver - start + offset ) ) ;
2024-03-25 13:01:46 -04:00
2014-06-25 13:36:30 -04:00
if ( pdfobj )
cli_jsonbool ( pdfobj , " BadVersionLocation " , 1 ) ;
2010-05-10 11:57:44 +03:00
}
2014-04-07 16:39:54 -04:00
2010-05-10 11:57:44 +03:00
offset + = pdfver - start ;
/* find trailer and xref, don't fail if not found */
2010-09-28 12:42:41 +03:00
map_off = ( off_t ) map - > len - 2048 ;
2010-05-10 11:57:44 +03:00
if ( map_off < 0 )
2014-04-07 16:39:54 -04:00
map_off = 0 ;
2010-05-10 11:57:44 +03:00
bytesleft = map - > len - map_off ;
2014-04-07 16:39:54 -04:00
2010-05-10 11:57:44 +03:00
eofmap = fmap_need_off_once ( map , map_off , bytesleft ) ;
if ( ! eofmap ) {
2014-04-07 16:39:54 -04:00
cli_errmsg ( " cli_pdf: mmap() failed (2) \n " ) ;
2018-08-14 14:00:31 -07:00
rc = CL_EMAP ;
goto done ;
2010-05-10 11:57:44 +03:00
}
2014-04-07 16:39:54 -04:00
2010-05-10 11:57:44 +03:00
eof = eofmap + bytesleft ;
2018-12-03 12:40:13 -05:00
for ( q = & eofmap [ bytesleft - 5 ] ; q > eofmap ; q - - ) {
2014-04-07 16:39:54 -04:00
if ( memcmp ( q , " %%EOF " , 5 ) = = 0 )
break ;
2010-05-10 11:57:44 +03:00
}
2014-04-07 16:39:54 -04:00
2010-05-10 11:57:44 +03:00
if ( q < = eofmap ) {
2014-04-07 16:39:54 -04:00
pdf . flags | = 1 < < BAD_PDF_TRAILER ;
cli_dbgmsg ( " cli_pdf: %%%%EOF not found \n " ) ;
2024-03-25 13:01:46 -04:00
2014-06-25 16:26:33 -04:00
if ( pdfobj )
cli_jsonbool ( pdfobj , " NoEOF " , 1 ) ;
2010-05-10 11:57:44 +03:00
} else {
2014-04-07 16:39:54 -04:00
const char * t ;
/*size = q - eofmap + map_off;*/
q - = 9 ;
2018-12-03 12:40:13 -05:00
for ( ; q > eofmap ; q - - ) {
2014-04-07 16:39:54 -04:00
if ( memcmp ( q , " startxref " , 9 ) = = 0 )
break ;
}
if ( q < = eofmap ) {
pdf . flags | = 1 < < BAD_PDF_TRAILER ;
cli_dbgmsg ( " cli_pdf: startxref not found \n " ) ;
2024-03-25 13:01:46 -04:00
2014-06-25 16:26:33 -04:00
if ( pdfobj )
cli_jsonbool ( pdfobj , " NoXREF " , 1 ) ;
2014-04-07 16:39:54 -04:00
} else {
2018-12-03 12:40:13 -05:00
for ( t = q ; t > eofmap ; t - - ) {
if ( memcmp ( t , " trailer " , 7 ) = = 0 )
2014-04-07 16:39:54 -04:00
break ;
}
pdf_parse_trailer ( & pdf , eofmap , eof - eofmap ) ;
q + = 9 ;
2018-12-03 12:40:13 -05:00
while ( q < eof & & ( * q = = ' ' | | * q = = ' \n ' | | * q = = ' \r ' ) ) {
q + + ;
}
2014-04-07 16:39:54 -04:00
2019-01-22 14:15:46 -05:00
if ( CL_SUCCESS ! = cli_strntol_wrap ( q , q - eofmap + map_off , 0 , 10 , & temp_long ) ) {
2018-06-02 20:58:35 -04:00
cli_dbgmsg ( " cli_pdf: failed to parse PDF trailer xref \n " ) ;
2014-04-07 16:39:54 -04:00
pdf . flags | = 1 < < BAD_PDF_TRAILER ;
2019-01-22 14:15:46 -05:00
} else if ( temp_long < 0 ) {
cli_dbgmsg ( " cli_pdf: Encountered invalid negative PDF trailer xref (%ld). \n " , temp_long ) ;
pdf . flags | = 1 < < BAD_PDF_TRAILER ;
2018-12-03 12:40:13 -05:00
} else {
2019-01-22 14:15:46 -05:00
xref = ( unsigned long ) temp_long ;
2018-06-02 20:58:35 -04:00
bytesleft = map - > len - offset - xref ;
if ( bytesleft > 4096 )
bytesleft = 4096 ;
q = fmap_need_off_once ( map , offset + xref , bytesleft ) ;
2018-12-03 12:40:13 -05:00
if ( ! q | | xrefCheck ( q , q + bytesleft ) = = - 1 ) {
2018-06-02 20:58:35 -04:00
cli_dbgmsg ( " cli_pdf: did not find valid xref \n " ) ;
pdf . flags | = 1 < < BAD_PDF_TRAILER ;
}
}
2014-04-07 16:39:54 -04:00
}
2010-05-10 11:57:44 +03:00
}
2014-04-07 16:39:54 -04:00
size - = offset ;
2010-05-10 11:57:44 +03:00
pdf . size = size ;
2018-12-03 12:40:13 -05:00
pdf . map = fmap_need_off ( map , offset , size ) ;
2010-05-10 11:57:44 +03:00
if ( ! pdf . map ) {
2014-04-07 16:39:54 -04:00
cli_errmsg ( " cli_pdf: mmap() failed (3) \n " ) ;
2018-08-14 14:00:31 -07:00
rc = CL_EMAP ;
goto done ;
2010-05-10 11:57:44 +03:00
}
2014-04-07 16:39:54 -04:00
pdf . startoff = offset ;
2025-06-08 01:12:33 -04:00
rc = run_pdf_hooks ( & pdf , PDF_PHASE_PRE , - 1 , NULL ) ;
2022-08-16 18:55:06 -07:00
if ( CL_SUCCESS ! = rc ) {
2014-04-07 16:39:54 -04:00
cli_dbgmsg ( " cli_pdf: (pre hooks) returning %d \n " , rc ) ;
2014-06-30 14:35:42 -04:00
2018-08-14 14:00:31 -07:00
rc = rc = = CL_BREAK ? CL_CLEAN : rc ;
goto done ;
2013-06-20 13:43:46 -04:00
}
2018-08-14 14:00:31 -07:00
/*
2019-03-05 21:15:41 -05:00
* Find and extract all objects in the PDF .
2020-07-01 16:56:26 -07:00
* This methodology adds objects from object streams .
2018-08-14 14:00:31 -07:00
*/
objs_found = pdf . nobjs ;
2022-08-16 18:55:06 -07:00
rc = pdf_find_and_extract_objs ( & pdf ) ;
2014-06-30 14:35:42 -04:00
2020-01-31 11:52:00 -08:00
if ( CL_EMEM = = rc ) {
cli_dbgmsg ( " cli_pdf: pdf_find_and_extract_objs had an allocation failure \n " ) ;
goto err ;
} else if ( pdf . nobjs < = objs_found ) {
2018-08-14 14:00:31 -07:00
cli_dbgmsg ( " cli_pdf: pdf_find_and_extract_objs did not find any new objects! \n " ) ;
} else {
cli_dbgmsg ( " cli_pdf: pdf_find_and_extract_objs found %d new objects. \n " , pdf . nobjs - objs_found ) ;
2010-05-11 10:37:10 +03:00
}
2010-07-30 16:54:53 +03:00
if ( pdf . flags & ( 1 < < ENCRYPTED_PDF ) )
2018-12-03 12:40:13 -05:00
pdf . flags & = ~ ( ( 1 < < BAD_FLATESTART ) | ( 1 < < BAD_STREAMSTART ) | ( 1 < < BAD_ASCIIDECODE ) ) ;
2010-07-30 16:54:53 +03:00
2022-08-16 18:55:06 -07:00
if ( pdf . flags & & CL_SUCCESS = = rc ) {
2014-04-07 16:39:54 -04:00
cli_dbgmsg ( " cli_pdf: flags 0x%02x \n " , pdf . flags ) ;
2025-06-08 01:12:33 -04:00
rc = run_pdf_hooks ( & pdf , PDF_PHASE_END , - 1 , NULL ) ;
2014-04-07 16:39:54 -04:00
2022-08-16 18:55:06 -07:00
if ( CL_SUCCESS = = rc & & SCAN_HEURISTICS & & ( ctx - > dconf - > other & OTHER_CONF_PDFNAMEOBJ ) ) {
2013-06-20 13:43:46 -04:00
if ( pdf . flags & ( 1 < < ESCAPED_COMMON_PDFNAME ) ) {
/* for example /Fl#61te#44#65#63#6f#64#65 instead of /FlateDecode */
2022-08-16 18:55:06 -07:00
rc = cli_append_potentially_unwanted ( ctx , " Heuristics.PDF.ObfuscatedNameObject " ) ;
2013-06-20 13:43:46 -04:00
}
}
2010-07-30 14:24:52 +03:00
#if 0
2018-06-01 14:23:25 -04:00
/* TODO: find both trailers, and /Encrypt settings */
if ( pdf . flags & ( 1 < < LINEARIZED_PDF ) )
pdf . flags & = ~ ( 1 < < BAD_ASCIIDECODE ) ;
if ( pdf . flags & ( 1 < < MANY_FILTERS ) )
pdf . flags & = ~ ( 1 < < BAD_ASCIIDECODE ) ;
2022-08-16 18:55:06 -07:00
if ( CL_SUCCESS = = rc & & ( pdf . flags &
2018-06-01 14:23:25 -04:00
( ( 1 < < BAD_PDF_TOOMANYOBJS ) | ( 1 < < BAD_STREAM_FILTERS ) |
( 1 < < BAD_FLATE ) | ( 1 < < BAD_ASCIIDECODE ) |
( 1 < < UNTERMINATED_OBJ_DICT ) | ( 1 < < UNKNOWN_FILTER ) ) ) ) {
rc = CL_EUNPACK ;
}
2010-07-30 14:23:10 +03:00
# endif
2010-05-11 11:26:35 +03:00
}
2012-12-17 11:12:11 -05:00
2018-08-14 14:00:31 -07:00
done :
2022-08-16 18:55:06 -07:00
if ( CL_SUCCESS = = rc & & pdf . stats . ninvalidobjs > 0 ) {
2012-12-17 11:12:11 -05:00
rc = CL_EFORMAT ;
}
2022-05-31 19:15:06 -04:00
err :
2014-04-29 17:27:02 -04:00
pdf_export_json ( & pdf ) ;
2018-08-14 14:00:31 -07:00
if ( pdf . objstms ) {
for ( i = 0 ; i < pdf . nobjstms ; i + + ) {
if ( pdf . objstms [ i ] ) {
if ( pdf . objstms [ i ] - > streambuf ) {
free ( pdf . objstms [ i ] - > streambuf ) ;
pdf . objstms [ i ] - > streambuf = NULL ;
}
free ( pdf . objstms [ i ] ) ;
pdf . objstms [ i ] = NULL ;
}
}
free ( pdf . objstms ) ;
pdf . objstms = NULL ;
}
if ( NULL ! = pdf . objs ) {
for ( i = 0 ; i < pdf . nobjs ; i + + ) {
if ( NULL ! = pdf . objs [ i ] ) {
2023-04-19 16:10:16 -07:00
if ( NULL ! = pdf . objs [ i ] - > path ) {
free ( pdf . objs [ i ] - > path ) ;
pdf . objs [ i ] - > path = NULL ;
}
2018-08-14 14:00:31 -07:00
free ( pdf . objs [ i ] ) ;
pdf . objs [ i ] = NULL ;
}
}
free ( pdf . objs ) ;
pdf . objs = NULL ;
}
if ( pdf . fileID ) {
free ( pdf . fileID ) ;
pdf . fileID = NULL ;
}
if ( pdf . key ) {
free ( pdf . key ) ;
pdf . key = NULL ;
}
2012-12-17 11:12:11 -05:00
2010-11-15 23:27:10 +02:00
/* PDF hooks may abort, don't return CL_BREAK to caller! */
2018-08-14 14:00:31 -07:00
rc = ( rc = = CL_BREAK ) ? CL_CLEAN : rc ;
cli_dbgmsg ( " cli_pdf: returning %d \n " , rc ) ;
return rc ;
2010-05-10 11:57:44 +03:00
}
2018-08-14 14:00:31 -07:00
/**
 * @brief   Skip the rest of the current line, and find the start of the next line.
 *
 * Only CR ('\r') and LF ('\n') are treated as end-of-line bytes. The bytes are
 * compared explicitly rather than with strchr("\r\n", c), because strchr()
 * also matches the string terminator and would therefore treat an embedded
 * NUL byte as a line ending.
 *
 * @param ptr   Current offset into buffer.
 * @param len   Remaining bytes in buffer.
 *
 * @return const char*  Address of next line, or NULL if no next line in buffer
 *                      (including when ptr is NULL, len is 0, or the buffer
 *                      ends on or before the end-of-line run).
 */
static const char *
pdf_nextlinestart(const char *ptr, size_t len)
{
    if (!ptr || (0 == len)) {
        /* Invalid args */
        return NULL;
    }

    /* Advance to the first end-of-line byte of the current line. */
    while (*ptr != '\r' && *ptr != '\n') {
        if (--len == 0)
            return NULL;

        ptr++;
    }

    /* Consume the whole run of end-of-line bytes (handles CRLF and blank lines). */
    while (*ptr == '\r' || *ptr == '\n') {
        if (--len == 0)
            return NULL;

        ptr++;
    }

    return ptr;
}
2005-05-27 14:44:00 +00:00
2018-08-14 14:00:31 -07:00
/**
 * @brief   Return the start of the next PDF object.
 *
 * This assumes that we're not in a stream.
 *
 * The scan starts "inside" the current token. Ordinary bytes are skipped
 * until a separator is seen (whitespace, '[', '<', a line ending or a '%'
 * comment); the first ordinary byte after a separator is returned. A '/'
 * (name) or '(' is returned immediately wherever it is found.
 *
 * @param ptr   Current offset into buffer.
 * @param len   Remaining bytes in buffer.
 *
 * @return const char*  Address of next object in the buffer, or NULL if there is none in the buffer.
 */
static const char *
pdf_nextobject(const char *ptr, size_t len)
{
    /* Becomes non-zero once a separator has been passed, i.e. once we have left the current token. */
    int left_token = 0;

    while (len > 0) {
        const char c = *ptr;

        /* Start of a name object or of a '(' string (JS): always an object start. */
        if (c == '/' || c == '(')
            return ptr;

        /* Line ending or comment: resume at the start of the following line. */
        if (c == '\n' || c == '\r' || c == '%') {
            const char *next_line = pdf_nextlinestart(ptr, len);

            if (next_line == NULL)
                return NULL;

            len -= (size_t)(next_line - ptr);
            ptr        = next_line;
            left_token = 1;
            continue;
        }

        if (c == ' ' || c == '\t' || c == '\v' || c == '\f' ||
            c == '[' /* start of an array object */ ||
            c == '<' /* start of a dictionary object */) {
            left_token = 1;
        } else if (left_token) {
            /* TODO: parse and return object type */
            return ptr;
        }

        ptr++;
        len--;
    }

    return NULL;
}
2014-04-16 14:23:16 -04:00
/* PDF statistics */
2014-06-25 14:06:17 -04:00
static void ASCIIHexDecode_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-04-16 14:23:16 -04:00
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-04-16 14:23:16 -04:00
return ;
pdf - > stats . nasciihexdecode + + ;
}
2014-06-25 14:06:17 -04:00
static void ASCII85Decode_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-04-16 14:23:16 -04:00
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-04-16 14:23:16 -04:00
return ;
pdf - > stats . nascii85decode + + ;
}
2014-06-25 14:06:17 -04:00
static void EmbeddedFile_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-04-16 14:23:16 -04:00
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-04-16 14:23:16 -04:00
return ;
pdf - > stats . nembeddedfile + + ;
}
2014-06-25 14:06:17 -04:00
static void FlateDecode_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-04-16 14:23:16 -04:00
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-04-16 14:23:16 -04:00
return ;
pdf - > stats . nflate + + ;
}
2014-06-25 14:06:17 -04:00
static void Image_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-04-16 14:23:16 -04:00
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-04-16 14:23:16 -04:00
return ;
pdf - > stats . nimage + + ;
}
2014-06-25 14:06:17 -04:00
static void LZWDecode_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-04-16 14:23:16 -04:00
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-04-16 14:23:16 -04:00
return ;
pdf - > stats . nlzw + + ;
}
2014-06-25 14:06:17 -04:00
static void RunLengthDecode_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-04-16 14:23:16 -04:00
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-04-16 14:23:16 -04:00
return ;
pdf - > stats . nrunlengthdecode + + ;
}
2014-06-25 14:06:17 -04:00
static void CCITTFaxDecode_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-04-16 14:23:16 -04:00
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-04-16 14:23:16 -04:00
return ;
pdf - > stats . nfaxdecode + + ;
}
2014-06-25 14:06:17 -04:00
static void JBIG2Decode_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-04-16 14:23:16 -04:00
{
2023-04-19 16:10:16 -07:00
cli_ctx * ctx = NULL ;
2014-07-10 18:11:49 -04:00
struct json_object * pdfobj , * jbig2arr ;
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2014-04-16 14:23:16 -04:00
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-06-13 20:40:46 -04:00
return ;
2023-04-19 16:10:16 -07:00
ctx = pdf - > ctx ;
2018-07-20 22:28:48 -04:00
if ( ! ( SCAN_COLLECT_METADATA ) )
2014-07-01 10:50:08 -04:00
return ;
2025-06-08 01:12:33 -04:00
if ( ! ( pdf - > ctx - > this_layer_metadata_json ) )
2014-06-13 20:40:46 -04:00
return ;
2025-06-08 01:12:33 -04:00
pdfobj = cli_jsonobj ( pdf - > ctx - > this_layer_metadata_json , " PDFStats " ) ;
2014-06-13 20:40:46 -04:00
if ( ! ( pdfobj ) )
return ;
jbig2arr = cli_jsonarray ( pdfobj , " JBIG2Objects " ) ;
if ( ! ( jbig2arr ) )
return ;
2018-12-03 12:40:13 -05:00
cli_jsonint_array ( jbig2arr , obj - > id > > 8 ) ;
2014-06-13 20:40:46 -04:00
pdf - > stats . njbig2decode + + ;
2014-04-16 14:23:16 -04:00
}
2014-06-25 14:06:17 -04:00
static void DCTDecode_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-04-16 14:23:16 -04:00
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-04-16 14:23:16 -04:00
return ;
pdf - > stats . ndctdecode + + ;
}
2014-06-25 14:06:17 -04:00
static void JPXDecode_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-04-16 14:23:16 -04:00
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-04-16 14:23:16 -04:00
return ;
pdf - > stats . njpxdecode + + ;
}
2014-06-25 14:06:17 -04:00
static void Crypt_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-04-16 14:23:16 -04:00
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-04-16 14:23:16 -04:00
return ;
pdf - > stats . ncrypt + + ;
}
2014-06-25 14:06:17 -04:00
static void Standard_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-04-16 14:23:16 -04:00
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-04-16 14:23:16 -04:00
return ;
pdf - > stats . nstandard + + ;
}
2014-06-25 14:06:17 -04:00
static void Sig_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-04-16 14:23:16 -04:00
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-04-16 14:23:16 -04:00
return ;
pdf - > stats . nsigned + + ;
}
2014-06-25 14:06:17 -04:00
/*
 * Deliberate no-op callback.
 *
 * Neither pdf->stats nor the JSON metadata is updated here. The actual
 * Javascript is looked for when the object is extracted later, which avoids
 * false positives for objects that merely reference an indirect object
 * without any content.
 */
static void JavaScript_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);
    UNUSEDPARAM(pdf);
}
2014-06-25 14:06:17 -04:00
static void OpenAction_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-04-16 14:23:16 -04:00
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-04-16 14:23:16 -04:00
return ;
pdf - > stats . nopenaction + + ;
}
2014-06-25 14:06:17 -04:00
static void Launch_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-04-16 14:23:16 -04:00
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-04-16 14:23:16 -04:00
return ;
pdf - > stats . nlaunch + + ;
}
2014-06-25 14:06:17 -04:00
static void Page_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-04-16 14:23:16 -04:00
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-04-16 14:23:16 -04:00
return ;
pdf - > stats . npage + + ;
}
2014-06-25 14:06:17 -04:00
static void Author_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-05-23 14:06:35 -04:00
{
2023-04-19 16:10:16 -07:00
cli_ctx * ctx = NULL ;
2018-07-20 22:28:48 -04:00
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-05-23 14:06:35 -04:00
return ;
2023-04-19 16:10:16 -07:00
ctx = pdf - > ctx ;
2018-07-20 22:28:48 -04:00
if ( ! ( SCAN_COLLECT_METADATA ) )
2014-07-01 10:50:08 -04:00
return ;
2015-04-01 17:41:59 -04:00
if ( ! ( pdf - > stats . author ) ) {
2018-08-14 14:00:31 -07:00
const char * objstart = ( obj - > objstm ) ? ( const char * ) ( obj - > start + obj - > objstm - > streambuf )
: ( const char * ) ( obj - > start + pdf - > map ) ;
2022-05-08 14:59:09 -07:00
pdf - > stats . author = calloc ( 1 , sizeof ( struct pdf_stats_entry ) ) ;
2015-04-01 17:41:59 -04:00
if ( ! ( pdf - > stats . author ) )
return ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth + + ;
2019-03-05 21:15:41 -05:00
pdf - > stats . author - > data = pdf_parse_string ( pdf , obj , objstart , obj - > size , " /Author " , NULL , & ( pdf - > stats . author - > meta ) ) ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth - - ;
2015-04-01 17:41:59 -04:00
}
2014-05-23 14:06:35 -04:00
}
2014-06-25 14:06:17 -04:00
static void Creator_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-05-23 14:06:35 -04:00
{
2023-04-19 16:10:16 -07:00
cli_ctx * ctx = NULL ;
2018-07-20 22:28:48 -04:00
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-05-23 14:06:35 -04:00
return ;
2023-04-19 16:10:16 -07:00
ctx = pdf - > ctx ;
2018-07-20 22:28:48 -04:00
if ( ! ( SCAN_COLLECT_METADATA ) )
2014-07-01 10:50:08 -04:00
return ;
2015-04-01 17:41:59 -04:00
if ( ! ( pdf - > stats . creator ) ) {
2018-08-14 14:00:31 -07:00
const char * objstart = ( obj - > objstm ) ? ( const char * ) ( obj - > start + obj - > objstm - > streambuf )
: ( const char * ) ( obj - > start + pdf - > map ) ;
2022-05-08 14:59:09 -07:00
pdf - > stats . creator = calloc ( 1 , sizeof ( struct pdf_stats_entry ) ) ;
2015-04-01 17:41:59 -04:00
if ( ! ( pdf - > stats . creator ) )
return ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth + + ;
2019-03-05 21:15:41 -05:00
pdf - > stats . creator - > data = pdf_parse_string ( pdf , obj , objstart , obj - > size , " /Creator " , NULL , & ( pdf - > stats . creator - > meta ) ) ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth - - ;
2015-04-01 17:41:59 -04:00
}
2014-05-23 14:06:35 -04:00
}
2014-06-25 14:06:17 -04:00
static void ModificationDate_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-05-23 14:06:35 -04:00
{
2023-04-19 16:10:16 -07:00
cli_ctx * ctx = NULL ;
2018-07-20 22:28:48 -04:00
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-05-23 14:06:35 -04:00
return ;
2023-04-19 16:10:16 -07:00
ctx = pdf - > ctx ;
2018-07-20 22:28:48 -04:00
if ( ! ( SCAN_COLLECT_METADATA ) )
2014-07-01 10:50:08 -04:00
return ;
2015-04-01 17:41:59 -04:00
if ( ! ( pdf - > stats . modificationdate ) ) {
2018-08-14 14:00:31 -07:00
const char * objstart = ( obj - > objstm ) ? ( const char * ) ( obj - > start + obj - > objstm - > streambuf )
: ( const char * ) ( obj - > start + pdf - > map ) ;
2022-05-08 14:59:09 -07:00
pdf - > stats . modificationdate = calloc ( 1 , sizeof ( struct pdf_stats_entry ) ) ;
2015-04-01 17:41:59 -04:00
if ( ! ( pdf - > stats . modificationdate ) )
return ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth + + ;
2019-03-05 21:15:41 -05:00
pdf - > stats . modificationdate - > data = pdf_parse_string ( pdf , obj , objstart , obj - > size , " /ModDate " , NULL , & ( pdf - > stats . modificationdate - > meta ) ) ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth - - ;
2015-04-01 17:41:59 -04:00
}
2014-05-23 14:06:35 -04:00
}
2014-06-25 14:06:17 -04:00
static void CreationDate_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-05-23 14:06:35 -04:00
{
2023-04-19 16:10:16 -07:00
cli_ctx * ctx = NULL ;
2018-07-20 22:28:48 -04:00
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-05-23 14:06:35 -04:00
return ;
2023-04-19 16:10:16 -07:00
ctx = pdf - > ctx ;
2018-07-20 22:28:48 -04:00
if ( ! ( SCAN_COLLECT_METADATA ) )
2014-07-01 10:50:08 -04:00
return ;
2015-04-01 17:41:59 -04:00
if ( ! ( pdf - > stats . creationdate ) ) {
2018-08-14 14:00:31 -07:00
const char * objstart = ( obj - > objstm ) ? ( const char * ) ( obj - > start + obj - > objstm - > streambuf )
: ( const char * ) ( obj - > start + pdf - > map ) ;
2022-05-08 14:59:09 -07:00
pdf - > stats . creationdate = calloc ( 1 , sizeof ( struct pdf_stats_entry ) ) ;
2015-04-01 17:41:59 -04:00
if ( ! ( pdf - > stats . creationdate ) )
return ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth + + ;
2019-03-05 21:15:41 -05:00
pdf - > stats . creationdate - > data = pdf_parse_string ( pdf , obj , objstart , obj - > size , " /CreationDate " , NULL , & ( pdf - > stats . creationdate - > meta ) ) ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth - - ;
2015-04-01 17:41:59 -04:00
}
2014-05-23 14:06:35 -04:00
}
2014-06-25 14:06:17 -04:00
static void Producer_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-05-23 14:06:35 -04:00
{
2023-04-19 16:10:16 -07:00
cli_ctx * ctx = NULL ;
2018-07-20 22:28:48 -04:00
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-05-23 14:06:35 -04:00
return ;
2023-04-19 16:10:16 -07:00
ctx = pdf - > ctx ;
2018-07-20 22:28:48 -04:00
if ( ! ( SCAN_COLLECT_METADATA ) )
2014-07-01 10:50:08 -04:00
return ;
2015-04-01 17:41:59 -04:00
if ( ! ( pdf - > stats . producer ) ) {
2018-08-14 14:00:31 -07:00
const char * objstart = ( obj - > objstm ) ? ( const char * ) ( obj - > start + obj - > objstm - > streambuf )
: ( const char * ) ( obj - > start + pdf - > map ) ;
2022-05-08 14:59:09 -07:00
pdf - > stats . producer = calloc ( 1 , sizeof ( struct pdf_stats_entry ) ) ;
2015-04-01 17:41:59 -04:00
if ( ! ( pdf - > stats . producer ) )
return ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth + + ;
2019-03-05 21:15:41 -05:00
pdf - > stats . producer - > data = pdf_parse_string ( pdf , obj , objstart , obj - > size , " /Producer " , NULL , & ( pdf - > stats . producer - > meta ) ) ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth - - ;
2015-04-01 17:41:59 -04:00
}
2014-05-23 14:06:35 -04:00
}
2014-06-25 14:06:17 -04:00
static void Title_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-06-13 11:18:07 -04:00
{
2023-04-19 16:10:16 -07:00
cli_ctx * ctx = NULL ;
2018-07-20 22:28:48 -04:00
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-06-13 11:18:07 -04:00
return ;
2023-04-19 16:10:16 -07:00
ctx = pdf - > ctx ;
2018-07-20 22:28:48 -04:00
if ( ! ( SCAN_COLLECT_METADATA ) )
2014-07-01 10:50:08 -04:00
return ;
2015-04-01 17:41:59 -04:00
if ( ! ( pdf - > stats . title ) ) {
2018-08-14 14:00:31 -07:00
const char * objstart = ( obj - > objstm ) ? ( const char * ) ( obj - > start + obj - > objstm - > streambuf )
: ( const char * ) ( obj - > start + pdf - > map ) ;
2022-05-08 14:59:09 -07:00
pdf - > stats . title = calloc ( 1 , sizeof ( struct pdf_stats_entry ) ) ;
2015-04-01 17:41:59 -04:00
if ( ! ( pdf - > stats . title ) )
return ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth + + ;
2019-03-05 21:15:41 -05:00
pdf - > stats . title - > data = pdf_parse_string ( pdf , obj , objstart , obj - > size , " /Title " , NULL , & ( pdf - > stats . title - > meta ) ) ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth - - ;
2015-04-01 17:41:59 -04:00
}
2014-06-13 11:18:07 -04:00
}
2014-06-25 14:06:17 -04:00
static void Keywords_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-06-13 11:18:07 -04:00
{
2023-04-19 16:10:16 -07:00
cli_ctx * ctx = NULL ;
2018-07-20 22:28:48 -04:00
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-06-13 11:18:07 -04:00
return ;
2023-04-19 16:10:16 -07:00
ctx = pdf - > ctx ;
2018-07-20 22:28:48 -04:00
if ( ! ( SCAN_COLLECT_METADATA ) )
2014-07-01 10:50:08 -04:00
return ;
2015-04-01 17:41:59 -04:00
if ( ! ( pdf - > stats . keywords ) ) {
2018-08-14 14:00:31 -07:00
const char * objstart = ( obj - > objstm ) ? ( const char * ) ( obj - > start + obj - > objstm - > streambuf )
: ( const char * ) ( obj - > start + pdf - > map ) ;
2022-05-08 14:59:09 -07:00
pdf - > stats . keywords = calloc ( 1 , sizeof ( struct pdf_stats_entry ) ) ;
2015-04-01 17:41:59 -04:00
if ( ! ( pdf - > stats . keywords ) )
return ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth + + ;
2019-03-05 21:15:41 -05:00
pdf - > stats . keywords - > data = pdf_parse_string ( pdf , obj , objstart , obj - > size , " /Keywords " , NULL , & ( pdf - > stats . keywords - > meta ) ) ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth - - ;
2015-04-01 17:41:59 -04:00
}
2014-06-13 11:18:07 -04:00
}
2014-06-25 14:06:17 -04:00
static void Subject_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-06-13 11:18:07 -04:00
{
2023-04-19 16:10:16 -07:00
cli_ctx * ctx = NULL ;
2018-07-20 22:28:48 -04:00
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-06-13 11:18:07 -04:00
return ;
2023-04-19 16:10:16 -07:00
ctx = pdf - > ctx ;
2018-07-20 22:28:48 -04:00
if ( ! ( SCAN_COLLECT_METADATA ) )
2014-07-01 10:50:08 -04:00
return ;
2015-04-01 17:41:59 -04:00
if ( ! ( pdf - > stats . subject ) ) {
2018-08-14 14:00:31 -07:00
const char * objstart = ( obj - > objstm ) ? ( const char * ) ( obj - > start + obj - > objstm - > streambuf )
: ( const char * ) ( obj - > start + pdf - > map ) ;
2022-05-08 14:59:09 -07:00
pdf - > stats . subject = calloc ( 1 , sizeof ( struct pdf_stats_entry ) ) ;
2015-04-01 17:41:59 -04:00
if ( ! ( pdf - > stats . subject ) )
return ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth + + ;
2019-03-05 21:15:41 -05:00
pdf - > stats . subject - > data = pdf_parse_string ( pdf , obj , objstart , obj - > size , " /Subject " , NULL , & ( pdf - > stats . subject - > meta ) ) ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth - - ;
2015-04-01 17:41:59 -04:00
}
2014-06-13 11:18:07 -04:00
}
2014-06-27 12:43:23 -04:00
static void RichMedia_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-06-27 12:43:23 -04:00
return ;
pdf - > stats . nrichmedia + + ;
}
static void AcroForm_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-06-27 12:43:23 -04:00
return ;
pdf - > stats . nacroform + + ;
}
static void XFA_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
{
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( obj ) ;
UNUSEDPARAM ( act ) ;
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-06-27 12:43:23 -04:00
return ;
pdf - > stats . nxfa + + ;
}
2014-06-25 14:06:17 -04:00
static void Pages_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-06-19 17:41:15 -04:00
{
2023-04-19 16:10:16 -07:00
cli_ctx * ctx = NULL ;
2014-06-23 17:41:28 -04:00
struct pdf_array * array ;
2018-08-14 14:00:31 -07:00
const char * objstart = ( obj - > objstm ) ? ( const char * ) ( obj - > start + obj - > objstm - > streambuf )
: ( const char * ) ( obj - > start + pdf - > map ) ;
2014-06-23 17:41:28 -04:00
const char * begin ;
2018-12-03 12:40:13 -05:00
unsigned long npages = 0 , count ;
2019-01-22 14:15:46 -05:00
long temp_long ;
2014-06-23 17:41:28 -04:00
struct pdf_array_node * node ;
json_object * pdfobj ;
2018-10-25 13:06:15 -07:00
size_t countsize = 0 ;
2014-06-23 17:41:28 -04:00
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( act ) ;
2025-06-08 01:12:33 -04:00
if ( ! ( pdf ) | | ! ( pdf - > ctx - > this_layer_metadata_json ) )
2014-06-23 17:41:28 -04:00
return ;
2023-04-19 16:10:16 -07:00
ctx = pdf - > ctx ;
2018-07-20 22:28:48 -04:00
if ( ! ( SCAN_COLLECT_METADATA ) )
2014-07-01 10:50:08 -04:00
return ;
2025-06-08 01:12:33 -04:00
pdfobj = cli_jsonobj ( pdf - > ctx - > this_layer_metadata_json , " PDFStats " ) ;
2014-06-23 17:41:28 -04:00
if ( ! ( pdfobj ) )
return ;
2019-03-05 21:15:41 -05:00
begin = cli_memstr ( objstart , obj - > size , " /Kids " , 5 ) ;
2014-06-23 17:41:28 -04:00
if ( ! ( begin ) )
return ;
begin + = 5 ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth + + ;
2019-03-05 21:15:41 -05:00
array = pdf_parse_array ( pdf , obj , obj - > size , ( char * ) begin , NULL ) ;
2022-07-01 17:32:00 -07:00
pdf - > parse_recursion_depth - - ;
2014-06-24 18:43:51 -04:00
if ( ! ( array ) ) {
cli_jsonbool ( pdfobj , " IncorrectPagesCount " , 1 ) ;
2014-06-24 10:43:15 -04:00
return ;
2014-06-24 18:43:51 -04:00
}
2014-06-24 10:43:15 -04:00
for ( node = array - > nodes ; node ! = NULL ; node = node - > next )
if ( node - > datasz )
2014-06-24 18:43:51 -04:00
if ( strchr ( ( char * ) ( node - > data ) , ' R ' ) )
2014-06-24 10:43:15 -04:00
npages + + ;
2014-06-23 17:41:28 -04:00
2019-03-05 21:15:41 -05:00
begin = cli_memstr ( objstart , obj - > size , " /Count " , 6 ) ;
2014-06-23 17:41:28 -04:00
if ( ! ( begin ) ) {
cli_jsonbool ( pdfobj , " IncorrectPagesCount " , 1 ) ;
goto cleanup ;
}
begin + = 6 ;
2019-03-05 21:15:41 -05:00
while ( ( ( size_t ) ( begin - objstart ) < obj - > size ) & & isspace ( begin [ 0 ] ) )
2014-06-23 17:41:28 -04:00
begin + + ;
2019-03-05 21:15:41 -05:00
if ( ( size_t ) ( begin - objstart ) > = obj - > size ) {
2014-06-23 17:41:28 -04:00
goto cleanup ;
}
2019-03-05 21:15:41 -05:00
countsize = ( obj - > objstm ) ? ( size_t ) ( obj - > start + obj - > objstm - > streambuf + obj - > size - begin )
: ( size_t ) ( obj - > start + pdf - > map + obj - > size - begin ) ;
2018-10-25 13:06:15 -07:00
2019-01-22 14:15:46 -05:00
if ( CL_SUCCESS ! = cli_strntol_wrap ( begin , countsize , 0 , 10 , & temp_long ) ) {
cli_jsonbool ( pdfobj , " IncorrectPagesCount " , 1 ) ;
} else if ( temp_long < 0 ) {
2014-06-23 17:41:28 -04:00
cli_jsonbool ( pdfobj , " IncorrectPagesCount " , 1 ) ;
2019-01-22 14:15:46 -05:00
} else {
count = ( unsigned long ) temp_long ;
if ( count ! = npages ) {
cli_jsonbool ( pdfobj , " IncorrectPagesCount " , 1 ) ;
}
2018-06-02 20:58:35 -04:00
}
2014-06-23 17:41:28 -04:00
cleanup :
pdf_free_array ( array ) ;
2014-06-19 17:41:15 -04:00
}
2014-06-25 14:06:17 -04:00
static void Colors_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
2014-06-19 17:41:15 -04:00
{
2023-04-19 16:10:16 -07:00
cli_ctx * ctx = NULL ;
2014-06-19 17:41:15 -04:00
json_object * colorsobj , * pdfobj ;
unsigned long ncolors ;
2019-01-22 14:15:46 -05:00
long temp_long ;
2018-08-14 14:00:31 -07:00
char * p1 ;
const char * objstart = ( obj - > objstm ) ? ( const char * ) ( obj - > start + obj - > objstm - > streambuf )
: ( const char * ) ( obj - > start + pdf - > map ) ;
2014-07-10 18:11:49 -04:00
UNUSEDPARAM ( act ) ;
2014-06-19 17:41:15 -04:00
2025-06-08 01:12:33 -04:00
if ( ! ( pdf ) | | ! ( pdf - > ctx ) | | ! ( pdf - > ctx - > this_layer_metadata_json ) )
2014-06-19 17:41:15 -04:00
return ;
2023-04-19 16:10:16 -07:00
ctx = pdf - > ctx ;
2018-07-20 22:28:48 -04:00
if ( ! ( SCAN_COLLECT_METADATA ) )
2014-07-01 10:50:08 -04:00
return ;
2019-03-05 21:15:41 -05:00
p1 = ( char * ) cli_memstr ( objstart , obj - > size , " /Colors " , 7 ) ;
2014-06-19 17:41:15 -04:00
if ( ! ( p1 ) )
return ;
p1 + = 7 ;
/* Ensure that we have at least one whitespace character plus at least one number */
2019-03-05 21:15:41 -05:00
if ( obj - > size - ( size_t ) ( p1 - objstart ) < 2 )
2014-06-19 17:41:15 -04:00
return ;
2019-03-05 21:15:41 -05:00
while ( ( ( size_t ) ( p1 - objstart ) < obj - > size ) & & isspace ( p1 [ 0 ] ) )
2014-06-19 17:41:15 -04:00
p1 + + ;
2019-03-05 21:15:41 -05:00
if ( ( size_t ) ( p1 - objstart ) = = obj - > size )
2014-06-19 17:41:15 -04:00
return ;
2019-03-05 21:15:41 -05:00
if ( CL_SUCCESS ! = cli_strntol_wrap ( p1 , ( size_t ) ( ( p1 - objstart ) - obj - > size ) , 0 , 10 , & temp_long ) ) {
2019-01-22 14:15:46 -05:00
return ;
} else if ( temp_long < 0 ) {
2018-06-02 20:58:35 -04:00
return ;
2019-01-22 14:15:46 -05:00
}
ncolors = ( unsigned long ) temp_long ;
2014-06-19 17:41:15 -04:00
/* We only care if the number of colors > 2**24 */
2018-12-03 12:40:13 -05:00
if ( ncolors < 1 < < 24 )
2014-06-19 17:41:15 -04:00
return ;
2025-06-08 01:12:33 -04:00
pdfobj = cli_jsonobj ( pdf - > ctx - > this_layer_metadata_json , " PDFStats " ) ;
2014-06-19 17:41:15 -04:00
if ( ! ( pdfobj ) )
return ;
colorsobj = cli_jsonarray ( pdfobj , " BigColors " ) ;
if ( ! ( colorsobj ) )
return ;
2018-12-03 12:40:13 -05:00
cli_jsonint_array ( colorsobj , obj - > id > > 8 ) ;
2014-06-19 17:41:15 -04:00
}
2025-04-07 16:50:09 -07:00
static void URI_cb ( struct pdf_struct * pdf , struct pdf_obj * obj , struct pdfname_action * act )
{
cli_ctx * ctx = NULL ;
off_t bytesleft = 0 ;
char * uri_start = NULL ;
char * uri_heap = NULL ;
const char * objstart = NULL ;
json_object * uriarr = NULL ;
UNUSEDPARAM ( act ) ;
2025-06-08 01:12:33 -04:00
if ( ! ( pdf ) | | ! ( pdf - > ctx ) | | ! ( pdf - > ctx - > this_layer_metadata_json ) | | ! obj ) {
2025-04-07 16:50:09 -07:00
return ;
}
objstart = ( obj - > objstm ) ? ( const char * ) ( obj - > start + obj - > objstm - > streambuf )
: ( const char * ) ( obj - > start + pdf - > map ) ;
ctx = pdf - > ctx ;
if ( ! ( SCAN_COLLECT_METADATA ) | | ! ( SCAN_STORE_PDF_URIS ) ) {
return ;
}
if ( obj - > size = = 0 ) {
return ;
}
if ( obj - > objstm ) {
bytesleft = MIN ( obj - > size , obj - > objstm - > streambuf_len - obj - > start ) ;
} else {
bytesleft = MIN ( obj - > size , pdf - > size - obj - > start ) ;
}
// Advance forward to the first '(' character
size_t start = 0 ;
while ( bytesleft > 0 & & objstart [ start ] ! = ' ( ' ) {
start + + ;
bytesleft - - ;
}
if ( bytesleft = = 0 ) {
return ;
}
// The first character past '(' is the start of the URI
uri_start = ( char * ) ( objstart + start + 1 ) ;
bytesleft - - ;
// Advance forward to the first ')' character
size_t end = 0 ;
while ( bytesleft > 0 & & uri_start [ end ] ! = ' ) ' ) {
end + + ;
bytesleft - - ;
}
if ( uri_start [ end ] ! = ' ) ' ) {
return ;
}
// Create a new string containing only the URI
CLI_MAX_MALLOC_OR_GOTO_DONE ( uri_heap , end + 1 ,
cli_errmsg ( " cli_pdf: malloc() failed (URI) \n " ) ) ;
strncpy ( uri_heap , uri_start , end ) ;
uri_heap [ end ] = ' \0 ' ;
2025-06-08 01:12:33 -04:00
uriarr = cli_jsonarray ( pdf - > ctx - > this_layer_metadata_json , " URIs " ) ;
2025-04-07 16:50:09 -07:00
if ( ! uriarr ) {
cli_errmsg ( " cli_pdf: malloc() failed (URI array) \n " ) ;
goto done ;
}
cli_jsonstr ( uriarr , NULL , uri_heap ) ;
done :
free ( uri_heap ) ;
}
2022-07-14 18:56:37 -07:00
static void pdf_free_stats ( struct pdf_struct * pdf )
{
2022-05-31 19:15:06 -04:00
if ( ! pdf ) {
return ;
}
if ( ( pdf - > stats . author ) ) {
if ( pdf - > stats . author - > data )
free ( pdf - > stats . author - > data ) ;
free ( pdf - > stats . author ) ;
pdf - > stats . author = NULL ;
}
if ( pdf - > stats . creator ) {
if ( pdf - > stats . creator - > data )
free ( pdf - > stats . creator - > data ) ;
free ( pdf - > stats . creator ) ;
pdf - > stats . creator = NULL ;
}
if ( pdf - > stats . producer ) {
if ( pdf - > stats . producer - > data )
free ( pdf - > stats . producer - > data ) ;
free ( pdf - > stats . producer ) ;
pdf - > stats . producer = NULL ;
}
if ( pdf - > stats . modificationdate ) {
if ( pdf - > stats . modificationdate - > data )
free ( pdf - > stats . modificationdate - > data ) ;
free ( pdf - > stats . modificationdate ) ;
pdf - > stats . modificationdate = NULL ;
}
if ( pdf - > stats . creationdate ) {
if ( pdf - > stats . creationdate - > data )
free ( pdf - > stats . creationdate - > data ) ;
free ( pdf - > stats . creationdate ) ;
pdf - > stats . creationdate = NULL ;
}
if ( pdf - > stats . title ) {
if ( pdf - > stats . title - > data )
free ( pdf - > stats . title - > data ) ;
free ( pdf - > stats . title ) ;
pdf - > stats . title = NULL ;
}
if ( pdf - > stats . subject ) {
if ( pdf - > stats . subject - > data )
free ( pdf - > stats . subject - > data ) ;
free ( pdf - > stats . subject ) ;
pdf - > stats . subject = NULL ;
}
if ( pdf - > stats . keywords ) {
if ( pdf - > stats . keywords - > data )
free ( pdf - > stats . keywords - > data ) ;
free ( pdf - > stats . keywords ) ;
pdf - > stats . keywords = NULL ;
}
}
2014-04-29 17:27:02 -04:00
static void pdf_export_json ( struct pdf_struct * pdf )
{
2023-04-19 16:10:16 -07:00
cli_ctx * ctx = NULL ;
2014-04-29 17:27:02 -04:00
json_object * pdfobj ;
2014-06-25 16:26:33 -04:00
unsigned long i ;
2014-04-29 17:27:02 -04:00
2023-04-19 16:10:16 -07:00
if ( NULL = = pdf )
2014-04-29 17:27:02 -04:00
return ;
2014-06-10 13:52:15 -04:00
if ( ! ( pdf - > ctx ) ) {
goto cleanup ;
}
2014-04-29 17:27:02 -04:00
2023-04-19 16:10:16 -07:00
ctx = pdf - > ctx ;
2025-06-08 01:12:33 -04:00
if ( ! ( SCAN_COLLECT_METADATA ) | | ! ( pdf - > ctx - > this_layer_metadata_json ) ) {
2014-06-10 13:52:15 -04:00
goto cleanup ;
}
2014-04-29 17:27:02 -04:00
2025-06-08 01:12:33 -04:00
pdfobj = cli_jsonobj ( pdf - > ctx - > this_layer_metadata_json , " PDFStats " ) ;
2014-06-10 13:52:15 -04:00
if ( ! ( pdfobj ) ) {
goto cleanup ;
}
2014-04-29 17:27:02 -04:00
2015-03-02 19:06:23 -05:00
if ( pdf - > stats . author ) {
2015-04-01 17:41:59 -04:00
if ( ! pdf - > stats . author - > meta . success ) {
char * out = pdf_finalize_string ( pdf , pdf - > stats . author - > meta . obj , pdf - > stats . author - > data , pdf - > stats . author - > meta . length ) ;
if ( out ) {
free ( pdf - > stats . author - > data ) ;
2018-12-03 12:40:13 -05:00
pdf - > stats . author - > data = out ;
pdf - > stats . author - > meta . length = strlen ( out ) ;
2015-04-01 17:41:59 -04:00
pdf - > stats . author - > meta . success = 1 ;
2015-03-20 16:36:41 -04:00
}
2015-03-02 19:06:23 -05:00
}
2015-04-01 17:41:59 -04:00
if ( pdf - > stats . author - > meta . success & & cli_isutf8 ( pdf - > stats . author - > data , pdf - > stats . author - > meta . length ) ) {
cli_jsonstr ( pdfobj , " Author " , pdf - > stats . author - > data ) ;
2015-04-08 11:09:52 -04:00
} else if ( pdf - > stats . author - > data & & pdf - > stats . author - > meta . length ) {
2015-04-14 15:53:17 -04:00
char * b64 = cl_base64_encode ( pdf - > stats . author - > data , pdf - > stats . author - > meta . length ) ;
2015-04-01 17:41:59 -04:00
cli_jsonstr ( pdfobj , " Author " , b64 ) ;
cli_jsonbool ( pdfobj , " Author_base64 " , 1 ) ;
free ( b64 ) ;
2015-04-14 15:53:17 -04:00
} else {
cli_jsonstr ( pdfobj , " Author " , " " ) ;
2015-04-01 17:41:59 -04:00
}
2015-03-02 19:06:23 -05:00
}
if ( pdf - > stats . creator ) {
2015-04-01 17:41:59 -04:00
if ( ! pdf - > stats . creator - > meta . success ) {
char * out = pdf_finalize_string ( pdf , pdf - > stats . creator - > meta . obj , pdf - > stats . creator - > data , pdf - > stats . creator - > meta . length ) ;
if ( out ) {
free ( pdf - > stats . creator - > data ) ;
2018-12-03 12:40:13 -05:00
pdf - > stats . creator - > data = out ;
pdf - > stats . creator - > meta . length = strlen ( out ) ;
2015-04-01 17:41:59 -04:00
pdf - > stats . creator - > meta . success = 1 ;
2015-03-20 16:36:41 -04:00
}
2015-03-02 19:06:23 -05:00
}
2015-04-01 17:41:59 -04:00
if ( pdf - > stats . creator - > meta . success & & cli_isutf8 ( pdf - > stats . creator - > data , pdf - > stats . creator - > meta . length ) ) {
cli_jsonstr ( pdfobj , " Creator " , pdf - > stats . creator - > data ) ;
2015-04-08 11:09:52 -04:00
} else if ( pdf - > stats . creator - > data & & pdf - > stats . creator - > meta . length ) {
2015-04-14 15:53:17 -04:00
char * b64 = cl_base64_encode ( pdf - > stats . creator - > data , pdf - > stats . creator - > meta . length ) ;
2015-04-01 17:41:59 -04:00
cli_jsonstr ( pdfobj , " Creator " , b64 ) ;
cli_jsonbool ( pdfobj , " Creator_base64 " , 1 ) ;
free ( b64 ) ;
2015-04-14 15:53:17 -04:00
} else {
cli_jsonstr ( pdfobj , " Creator " , " " ) ;
2015-04-01 17:41:59 -04:00
}
2015-03-02 19:06:23 -05:00
}
if ( pdf - > stats . producer ) {
2015-04-01 17:41:59 -04:00
if ( ! pdf - > stats . producer - > meta . success ) {
char * out = pdf_finalize_string ( pdf , pdf - > stats . producer - > meta . obj , pdf - > stats . producer - > data , pdf - > stats . producer - > meta . length ) ;
if ( out ) {
free ( pdf - > stats . producer - > data ) ;
2018-12-03 12:40:13 -05:00
pdf - > stats . producer - > data = out ;
pdf - > stats . producer - > meta . length = strlen ( out ) ;
2015-04-01 17:41:59 -04:00
pdf - > stats . producer - > meta . success = 1 ;
2015-03-20 16:36:41 -04:00
}
2015-03-02 19:06:23 -05:00
}
2015-04-01 17:41:59 -04:00
if ( pdf - > stats . producer - > meta . success & & cli_isutf8 ( pdf - > stats . producer - > data , pdf - > stats . producer - > meta . length ) ) {
cli_jsonstr ( pdfobj , " Producer " , pdf - > stats . producer - > data ) ;
2015-04-08 11:09:52 -04:00
} else if ( pdf - > stats . producer - > data & & pdf - > stats . producer - > meta . length ) {
2015-04-14 15:53:17 -04:00
char * b64 = cl_base64_encode ( pdf - > stats . producer - > data , pdf - > stats . producer - > meta . length ) ;
2015-04-01 17:41:59 -04:00
cli_jsonstr ( pdfobj , " Producer " , b64 ) ;
cli_jsonbool ( pdfobj , " Producer_base64 " , 1 ) ;
free ( b64 ) ;
2015-04-14 15:53:17 -04:00
} else {
cli_jsonstr ( pdfobj , " Producer " , " " ) ;
2015-04-01 17:41:59 -04:00
}
2015-03-02 19:06:23 -05:00
}
if ( pdf - > stats . modificationdate ) {
2015-04-01 17:41:59 -04:00
if ( ! pdf - > stats . modificationdate - > meta . success ) {
char * out = pdf_finalize_string ( pdf , pdf - > stats . modificationdate - > meta . obj , pdf - > stats . modificationdate - > data , pdf - > stats . modificationdate - > meta . length ) ;
if ( out ) {
free ( pdf - > stats . modificationdate - > data ) ;
2018-12-03 12:40:13 -05:00
pdf - > stats . modificationdate - > data = out ;
pdf - > stats . modificationdate - > meta . length = strlen ( out ) ;
2015-04-01 17:41:59 -04:00
pdf - > stats . modificationdate - > meta . success = 1 ;
2015-03-20 16:36:41 -04:00
}
2015-03-02 19:06:23 -05:00
}
2015-04-01 17:41:59 -04:00
if ( pdf - > stats . modificationdate - > meta . success & & cli_isutf8 ( pdf - > stats . modificationdate - > data , pdf - > stats . modificationdate - > meta . length ) ) {
cli_jsonstr ( pdfobj , " ModificationDate " , pdf - > stats . modificationdate - > data ) ;
2015-04-08 11:09:52 -04:00
} else if ( pdf - > stats . modificationdate - > data & & pdf - > stats . modificationdate - > meta . length ) {
2015-04-14 15:53:17 -04:00
char * b64 = cl_base64_encode ( pdf - > stats . modificationdate - > data , pdf - > stats . modificationdate - > meta . length ) ;
2015-04-01 17:41:59 -04:00
cli_jsonstr ( pdfobj , " ModificationDate " , b64 ) ;
cli_jsonbool ( pdfobj , " ModificationDate_base64 " , 1 ) ;
free ( b64 ) ;
2015-04-14 15:53:17 -04:00
} else {
cli_jsonstr ( pdfobj , " ModificationDate " , " " ) ;
2015-04-01 17:41:59 -04:00
}
2015-03-02 19:06:23 -05:00
}
if ( pdf - > stats . creationdate ) {
2015-04-01 17:41:59 -04:00
if ( ! pdf - > stats . creationdate - > meta . success ) {
char * out = pdf_finalize_string ( pdf , pdf - > stats . creationdate - > meta . obj , pdf - > stats . creationdate - > data , pdf - > stats . creationdate - > meta . length ) ;
if ( out ) {
free ( pdf - > stats . creationdate - > data ) ;
2018-12-03 12:40:13 -05:00
pdf - > stats . creationdate - > data = out ;
pdf - > stats . creationdate - > meta . length = strlen ( out ) ;
2015-04-01 17:41:59 -04:00
pdf - > stats . creationdate - > meta . success = 1 ;
2015-03-20 16:36:41 -04:00
}
2015-03-02 19:06:23 -05:00
}
2015-04-01 17:41:59 -04:00
if ( pdf - > stats . creationdate - > meta . success & & cli_isutf8 ( pdf - > stats . creationdate - > data , pdf - > stats . creationdate - > meta . length ) ) {
cli_jsonstr ( pdfobj , " CreationDate " , pdf - > stats . creationdate - > data ) ;
2015-04-08 11:09:52 -04:00
} else if ( pdf - > stats . creationdate - > data & & pdf - > stats . creationdate - > meta . length ) {
2015-04-14 15:53:17 -04:00
char * b64 = cl_base64_encode ( pdf - > stats . creationdate - > data , pdf - > stats . creationdate - > meta . length ) ;
2015-04-01 17:41:59 -04:00
cli_jsonstr ( pdfobj , " CreationDate " , b64 ) ;
cli_jsonbool ( pdfobj , " CreationDate_base64 " , 1 ) ;
free ( b64 ) ;
2015-04-14 15:53:17 -04:00
} else {
cli_jsonstr ( pdfobj , " CreationDate " , " " ) ;
2015-04-01 17:41:59 -04:00
}
2015-03-02 19:06:23 -05:00
}
if ( pdf - > stats . title ) {
2015-04-01 17:41:59 -04:00
if ( ! pdf - > stats . title - > meta . success ) {
char * out = pdf_finalize_string ( pdf , pdf - > stats . title - > meta . obj , pdf - > stats . title - > data , pdf - > stats . title - > meta . length ) ;
if ( out ) {
free ( pdf - > stats . title - > data ) ;
2018-12-03 12:40:13 -05:00
pdf - > stats . title - > data = out ;
pdf - > stats . title - > meta . length = strlen ( out ) ;
2015-04-01 17:41:59 -04:00
pdf - > stats . title - > meta . success = 1 ;
2015-03-20 16:36:41 -04:00
}
2015-03-02 19:06:23 -05:00
}
2015-04-01 17:41:59 -04:00
if ( pdf - > stats . title - > meta . success & & cli_isutf8 ( pdf - > stats . title - > data , pdf - > stats . title - > meta . length ) ) {
cli_jsonstr ( pdfobj , " Title " , pdf - > stats . title - > data ) ;
2015-04-08 11:09:52 -04:00
} else if ( pdf - > stats . title - > data & & pdf - > stats . title - > meta . length ) {
2015-04-14 15:53:17 -04:00
char * b64 = cl_base64_encode ( pdf - > stats . title - > data , pdf - > stats . title - > meta . length ) ;
2015-04-01 17:41:59 -04:00
cli_jsonstr ( pdfobj , " Title " , b64 ) ;
cli_jsonbool ( pdfobj , " Title_base64 " , 1 ) ;
free ( b64 ) ;
2015-04-14 15:53:17 -04:00
} else {
cli_jsonstr ( pdfobj , " Title " , " " ) ;
2015-04-01 17:41:59 -04:00
}
2015-03-02 19:06:23 -05:00
}
if ( pdf - > stats . subject ) {
2015-04-01 17:41:59 -04:00
if ( ! pdf - > stats . subject - > meta . success ) {
char * out = pdf_finalize_string ( pdf , pdf - > stats . subject - > meta . obj , pdf - > stats . subject - > data , pdf - > stats . subject - > meta . length ) ;
if ( out ) {
free ( pdf - > stats . subject - > data ) ;
2018-12-03 12:40:13 -05:00
pdf - > stats . subject - > data = out ;
pdf - > stats . subject - > meta . length = strlen ( out ) ;
2015-04-01 17:41:59 -04:00
pdf - > stats . subject - > meta . success = 1 ;
2015-03-20 16:36:41 -04:00
}
2015-03-02 19:06:23 -05:00
}
2015-04-01 17:41:59 -04:00
if ( pdf - > stats . subject - > meta . success & & cli_isutf8 ( pdf - > stats . subject - > data , pdf - > stats . subject - > meta . length ) ) {
cli_jsonstr ( pdfobj , " Subject " , pdf - > stats . subject - > data ) ;
2015-04-08 11:09:52 -04:00
} else if ( pdf - > stats . subject - > data & & pdf - > stats . subject - > meta . length ) {
2015-04-14 15:53:17 -04:00
char * b64 = cl_base64_encode ( pdf - > stats . subject - > data , pdf - > stats . subject - > meta . length ) ;
2015-04-01 17:41:59 -04:00
cli_jsonstr ( pdfobj , " Subject " , b64 ) ;
cli_jsonbool ( pdfobj , " Subject_base64 " , 1 ) ;
free ( b64 ) ;
2015-04-14 15:53:17 -04:00
} else {
cli_jsonstr ( pdfobj , " Subject " , " " ) ;
2015-04-01 17:41:59 -04:00
}
2015-03-02 19:06:23 -05:00
}
if ( pdf - > stats . keywords ) {
2015-04-01 17:41:59 -04:00
if ( ! pdf - > stats . keywords - > meta . success ) {
char * out = pdf_finalize_string ( pdf , pdf - > stats . keywords - > meta . obj , pdf - > stats . keywords - > data , pdf - > stats . keywords - > meta . length ) ;
if ( out ) {
free ( pdf - > stats . keywords - > data ) ;
2018-12-03 12:40:13 -05:00
pdf - > stats . keywords - > data = out ;
pdf - > stats . keywords - > meta . length = strlen ( out ) ;
2015-04-01 17:41:59 -04:00
pdf - > stats . keywords - > meta . success = 1 ;
2015-03-20 16:36:41 -04:00
}
2015-03-02 19:06:23 -05:00
}
2015-04-01 17:41:59 -04:00
if ( pdf - > stats . keywords - > meta . success & & cli_isutf8 ( pdf - > stats . keywords - > data , pdf - > stats . keywords - > meta . length ) ) {
cli_jsonstr ( pdfobj , " Keywords " , pdf - > stats . keywords - > data ) ;
2015-04-08 11:09:52 -04:00
} else if ( pdf - > stats . keywords - > data & & pdf - > stats . keywords - > meta . length ) {
2015-04-14 15:53:17 -04:00
char * b64 = cl_base64_encode ( pdf - > stats . keywords - > data , pdf - > stats . keywords - > meta . length ) ;
2015-04-01 17:41:59 -04:00
cli_jsonstr ( pdfobj , " Keywords " , b64 ) ;
cli_jsonbool ( pdfobj , " Keywords_base64 " , 1 ) ;
free ( b64 ) ;
2015-04-14 15:53:17 -04:00
} else {
2015-04-14 16:02:37 -04:00
cli_jsonstr ( pdfobj , " Keywords " , " " ) ;
2015-04-01 17:41:59 -04:00
}
2015-03-02 19:06:23 -05:00
}
2014-04-29 17:27:02 -04:00
if ( pdf - > stats . ninvalidobjs )
cli_jsonint ( pdfobj , " InvalidObjectCount " , pdf - > stats . ninvalidobjs ) ;
if ( pdf - > stats . njs )
cli_jsonint ( pdfobj , " JavaScriptObjectCount " , pdf - > stats . njs ) ;
if ( pdf - > stats . nflate )
cli_jsonint ( pdfobj , " DeflateObjectCount " , pdf - > stats . nflate ) ;
if ( pdf - > stats . nactivex )
cli_jsonint ( pdfobj , " ActiveXObjectCount " , pdf - > stats . nactivex ) ;
if ( pdf - > stats . nflash )
cli_jsonint ( pdfobj , " FlashObjectCount " , pdf - > stats . nflash ) ;
if ( pdf - > stats . ncolors )
cli_jsonint ( pdfobj , " ColorCount " , pdf - > stats . ncolors ) ;
if ( pdf - > stats . nasciihexdecode )
cli_jsonint ( pdfobj , " AsciiHexDecodeObjectCount " , pdf - > stats . nasciihexdecode ) ;
if ( pdf - > stats . nascii85decode )
cli_jsonint ( pdfobj , " Ascii85DecodeObjectCount " , pdf - > stats . nascii85decode ) ;
if ( pdf - > stats . nembeddedfile )
cli_jsonint ( pdfobj , " EmbeddedFileCount " , pdf - > stats . nembeddedfile ) ;
if ( pdf - > stats . nimage )
cli_jsonint ( pdfobj , " ImageCount " , pdf - > stats . nimage ) ;
if ( pdf - > stats . nlzw )
cli_jsonint ( pdfobj , " LZWCount " , pdf - > stats . nlzw ) ;
if ( pdf - > stats . nrunlengthdecode )
cli_jsonint ( pdfobj , " RunLengthDecodeCount " , pdf - > stats . nrunlengthdecode ) ;
if ( pdf - > stats . nfaxdecode )
cli_jsonint ( pdfobj , " FaxDecodeCount " , pdf - > stats . nfaxdecode ) ;
if ( pdf - > stats . njbig2decode )
cli_jsonint ( pdfobj , " JBIG2DecodeCount " , pdf - > stats . njbig2decode ) ;
if ( pdf - > stats . ndctdecode )
cli_jsonint ( pdfobj , " DCTDecodeCount " , pdf - > stats . ndctdecode ) ;
if ( pdf - > stats . njpxdecode )
cli_jsonint ( pdfobj , " JPXDecodeCount " , pdf - > stats . njpxdecode ) ;
if ( pdf - > stats . ncrypt )
cli_jsonint ( pdfobj , " CryptCount " , pdf - > stats . ncrypt ) ;
if ( pdf - > stats . nstandard )
cli_jsonint ( pdfobj , " StandardCount " , pdf - > stats . nstandard ) ;
if ( pdf - > stats . nsigned )
cli_jsonint ( pdfobj , " SignedCount " , pdf - > stats . nsigned ) ;
if ( pdf - > stats . nopenaction )
cli_jsonint ( pdfobj , " OpenActionCount " , pdf - > stats . nopenaction ) ;
if ( pdf - > stats . nlaunch )
cli_jsonint ( pdfobj , " LaunchCount " , pdf - > stats . nlaunch ) ;
if ( pdf - > stats . npage )
cli_jsonint ( pdfobj , " PageCount " , pdf - > stats . npage ) ;
2014-06-27 12:43:23 -04:00
if ( pdf - > stats . nrichmedia )
cli_jsonint ( pdfobj , " RichMediaCount " , pdf - > stats . nrichmedia ) ;
if ( pdf - > stats . nacroform )
cli_jsonint ( pdfobj , " AcroFormCount " , pdf - > stats . nacroform ) ;
if ( pdf - > stats . nxfa )
cli_jsonint ( pdfobj , " XFACount " , pdf - > stats . nxfa ) ;
2014-04-30 16:38:55 -04:00
if ( pdf - > flags & ( 1 < < BAD_PDF_VERSION ) )
cli_jsonbool ( pdfobj , " BadVersion " , 1 ) ;
if ( pdf - > flags & ( 1 < < BAD_PDF_HEADERPOS ) )
cli_jsonbool ( pdfobj , " BadHeaderPosition " , 1 ) ;
if ( pdf - > flags & ( 1 < < BAD_PDF_TRAILER ) )
cli_jsonbool ( pdfobj , " BadTrailer " , 1 ) ;
if ( pdf - > flags & ( 1 < < BAD_PDF_TOOMANYOBJS ) )
cli_jsonbool ( pdfobj , " TooManyObjects " , 1 ) ;
if ( pdf - > flags & ( 1 < < ENCRYPTED_PDF ) ) {
cli_jsonbool ( pdfobj , " Encrypted " , 1 ) ;
if ( pdf - > flags & ( 1 < < DECRYPTABLE_PDF ) )
cli_jsonbool ( pdfobj , " Decryptable " , 1 ) ;
2015-03-20 16:44:14 -04:00
else
cli_jsonbool ( pdfobj , " Decryptable " , 0 ) ;
2014-04-30 16:38:55 -04:00
}
2014-06-10 13:52:15 -04:00
2018-12-03 12:40:13 -05:00
for ( i = 0 ; i < pdf - > nobjs ; i + + ) {
if ( pdf - > objs [ i ] - > flags & ( 1 < < OBJ_TRUNCATED ) ) {
2014-06-25 16:26:33 -04:00
json_object * truncobj ;
truncobj = cli_jsonarray ( pdfobj , " TruncatedObjects " ) ;
if ( ! ( truncobj ) )
continue ;
2018-08-14 14:00:31 -07:00
cli_jsonint_array ( truncobj , pdf - > objs [ i ] - > id > > 8 ) ;
2014-06-25 16:26:33 -04:00
}
}
2014-06-10 13:52:15 -04:00
cleanup :
2022-05-31 19:15:06 -04:00
pdf_free_stats ( pdf ) ;
2014-04-29 17:27:02 -04:00
}