clamav/libclamav/pdf.c
John Humlick a5cdc7faf7
libclamav: Fix PDF scan crash on some platforms with optimization enabled (#1572)
Compiler optimization results in invalid memory access on some 
systems with the PDF `pdfname_action` pointer dereference. 

This fix changes the logic so that rather than assign the pointer to
the struct containing the callback, the string that would result in
the pointer assignment later on is changed to result in the same
assignment. This fixes the issue on all tested platforms.

Resolves: https://github.com/Cisco-Talos/clamav/issues/1566

CLAM-2859
2025-09-26 14:14:25 -04:00

5151 lines
168 KiB
C

/*
* Copyright (C) 2013-2025 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
* Copyright (C) 2007-2013 Sourcefire, Inc.
*
* Authors: Nigel Horne, Török Edvin
*
* Also based on Matt Olney's pdf parser in snort-nrt.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*
* TODO: Embedded fonts
* TODO: Predictor image handling
*/
#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <ctype.h>
#include <string.h>
#include <fcntl.h>
#include <stdlib.h>
#include <errno.h>
#ifdef HAVE_LIMITS_H
#include <limits.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <zlib.h>
#if HAVE_ICONV
#include <iconv.h>
#endif
#ifdef _WIN32
#include <stdint.h>
#endif
#include "clamav.h"
#include "others.h"
#include "pdf.h"
#include "pdfdecode.h"
#include "scanners.h"
#include "fmap.h"
#include "str.h"
#include "entconv.h"
#include "bytecode.h"
#include "bytecode_api.h"
#include "arc4.h"
#include "rijndael.h"
#include "textnorm.h"
#include "conv.h"
#include "json_api.h"
#ifdef CL_DEBUG
/*#define SAVE_TMP
*Save the file being worked on in tmp */
#endif
/* Upper bound on the number of objects tracked for one PDF. The object finders
 * in this file stop and set the BAD_PDF_TOOMANYOBJS flag bit once pdf->nobjs
 * reaches this value. */
#define MAX_PDF_OBJECTS (64 * 1024)
/* Tag declaration so the prototypes below can name pointers to the PDF context. */
struct pdf_struct;
/* Prototypes for parsing helpers used before their definitions
 * (the definitions are not in this part of the file). */
static const char *pdf_nextlinestart(const char *ptr, size_t len);
static const char *pdf_nextobject(const char *ptr, size_t len);
/* PDF statistics callbacks and related */
/* Every *_cb prototype below shares one signature:
 * (struct pdf_struct *, struct pdf_obj *, struct pdfname_action *). */
struct pdfname_action;
static void pdf_export_json(struct pdf_struct *);
static void ASCIIHexDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void ASCII85Decode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void EmbeddedFile_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void FlateDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Image_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void LZWDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void RunLengthDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void CCITTFaxDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void JBIG2Decode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void DCTDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void JPXDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Crypt_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Standard_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Sig_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void JavaScript_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void OpenAction_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Launch_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Page_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Author_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Creator_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Producer_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void CreationDate_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void ModificationDate_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Title_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Subject_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Keywords_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Pages_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
static void URI_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
/* End PDF statistics callbacks and related */
/* Prototypes for dictionary value readers: each takes a buffer (q/q0), its
 * length, and the dictionary key to look up. */
static int pdf_readint(const char *q0, int len, const char *key);
static const char *pdf_getdict(const char *q0, int *len, const char *key);
static char *pdf_readval(const char *q, int len, const char *key);
static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *slen, const char **qend, bool noescape);
/*
 * Check whether the bytes in [xref, eof) look like a cross-reference section.
 *
 * Leading spaces, '\n' and '\r' are skipped first. The data is accepted when
 * it starts with the keyword "xref", or when "/XRef" occurs anywhere in the
 * remaining bytes (which may indicate a cross-reference stream).
 *
 * Returns 0 when one of the two markers was found, -1 otherwise.
 */
static int xrefCheck(const char *xref, const char *eof)
{
    static const char table_kw[]  = "xref";
    static const char stream_kw[] = "/XRef";
    const char *cursor;

    /* Step over leading blanks and line breaks. */
    for (; xref < eof; xref++) {
        if (*xref != ' ' && *xref != '\n' && *xref != '\r')
            break;
    }

    /* Require more than the bare keyword to be left in the buffer. */
    if (xref + 4 >= eof)
        return -1;

    if (0 == memcmp(xref, table_kw, sizeof(table_kw) - 1)) {
        cli_dbgmsg("cli_pdf: found xref\n");
        return 0;
    }

    /* could be xref stream */
    cursor = xref;
    while (cursor + 5 < eof) {
        if (0 == memcmp(cursor, stream_kw, sizeof(stream_kw) - 1)) {
            cli_dbgmsg("cli_pdf: found /XRef\n");
            return 0;
        }
        cursor++;
    }

    return -1;
}
/* define this to be noisy about things that we can't parse properly */
/* NOISY is explicitly undefined on the next line, so as written the #else
 * branch is the one compiled and both macros expand to nothing. */
#undef NOISY
#ifdef NOISY
/* Verbose variants: forward to cli_infomsg() (passing pdf->ctx) and to cli_warnmsg(). */
#define noisy_msg(pdf, ...) cli_infomsg(pdf->ctx, __VA_ARGS__)
#define noisy_warnmsg(...) cli_warnmsg(__VA_ARGS__)
#else
/* Quiet variants: calls to these macros compile away entirely. */
#define noisy_msg(pdf, ...)
#define noisy_warnmsg(...)
#endif
/**
 * @brief Scan BACKwards from q for the nearest byte that is not whitespace.
 *
 * The bytes treated as whitespace are 0x00, 0x09, 0x0a, 0x0c, 0x0d and 0x20.
 * The byte at `start` itself is never examined.
 *
 * @param q     Position to begin scanning from (the high end of the search space).
 * @param start Lowest address of the search space.
 *
 * @return const char* Address of the first non-whitespace byte met while
 *                     walking down from q, or `start` if only whitespace was
 *                     seen. If q is not above `start`, q is returned unchanged.
 */
static const char *findNextNonWSBack(const char *q, const char *start)
{
    for (; q > start; q--) {
        switch (*q) {
            case 0x00:
            case 0x09:
            case 0x0a:
            case 0x0c:
            case 0x0d:
            case 0x20:
                /* whitespace: keep walking backwards */
                continue;
            default:
                return q;
        }
    }

    return q;
}
/**
 * @brief Scan FORwards from q for the nearest byte that is not whitespace.
 *
 * The bytes treated as whitespace are 0x00, 0x09, 0x0a, 0x0c, 0x0d and 0x20.
 * The byte at `end` itself is never examined.
 *
 * @param q   Position to begin scanning from (the low end of the search space).
 * @param end One-past-the-last address of the search space.
 *
 * @return const char* Address of the first non-whitespace byte at or after q,
 *                     or `end` if only whitespace was seen. If q is not below
 *                     `end`, q is returned unchanged.
 */
static const char *findNextNonWS(const char *q, const char *end)
{
    for (; q < end; q++) {
        switch (*q) {
            case 0x00:
            case 0x09:
            case 0x0a:
            case 0x0c:
            case 0x0d:
            case 0x20:
                /* whitespace: keep walking forwards */
                continue;
            default:
                return q;
        }
    }

    return q;
}
/**
 * @brief Locate the data of a PDF stream inside a buffer.
 *
 * Stream data sits between the keywords "stream" and "endstream". One line
 * break directly after "stream" ("\r\n" or "\n") is not counted as data.
 *
 * @param start            Start address of the search space.
 * @param size             Size of the search space in bytes.
 * @param[out] stream      Receives the address of the first stream data byte
 *                         (NULL if "stream" was not found).
 * @param[out] stream_size Receives the number of stream data bytes.
 * @param newline_hack     When non-zero, one additional "\n" following a
 *                         "\r\n" is skipped as well.
 *
 * @return cl_error_t CL_SUCCESS if both keywords were found.
 * @return cl_error_t CL_BREAK   if "stream" was not found, or nothing follows it.
 * @return cl_error_t CL_EFORMAT if "stream" was found but "endstream" was not;
 *                               the data is then reported as running to the
 *                               end of the buffer. (truncated)
 * @return cl_error_t CL_EARG    if invalid args were provided.
 */
static cl_error_t find_stream_bounds(
    const char *start,
    size_t size,
    const char **stream,
    size_t *stream_size,
    int newline_hack)
{
    static const char open_kw[]  = "stream";
    static const char close_kw[] = "endstream";
    const char *keyword;
    const char *data;
    const char *data_end;
    size_t consumed;
    size_t remaining;

    if (NULL == start || 0 == size || NULL == stream || NULL == stream_size)
        return CL_EARG;

    *stream      = NULL;
    *stream_size = 0;

    /* Begin by finding the "stream" string that prefixes stream data. */
    keyword = cli_memstr(start, size, open_kw, sizeof(open_kw) - 1);
    if (NULL == keyword)
        return CL_BREAK;

    data     = keyword + (sizeof(open_kw) - 1);
    consumed = (size_t)(data - start);
    if (consumed >= size)
        return CL_BREAK;
    remaining = size - consumed;

    /* Skip any new line characters. */
    if (remaining >= 2 && data[0] == '\xd' && data[1] == '\xa') {
        data += 2;
        remaining -= 2;
        if (newline_hack && remaining > 2 && data[0] == '\xa') {
            data += 1;
            remaining -= 1;
        }
    } else if (remaining && data[0] == '\xa') {
        data += 1;
        remaining -= 1;
    }

    /* Pass back start of the stream data. */
    *stream = data;

    /* Now find the "endstream" string that suffixes stream data. */
    data_end = cli_memstr(data, remaining, close_kw, sizeof(close_kw) - 1);
    if (NULL == data_end) {
        /* Couldn't find "endstream", but that's ok --
         * -- we'll just count the rest of the provided buffer. */
        cli_dbgmsg("find_stream_bounds: Truncated stream found!\n");
        *stream_size = (start + size) - data;
        return CL_EFORMAT;
    }

    /* Pass back the size of the stream data. */
    *stream_size = data_end - data;
    return CL_SUCCESS;
}
/**
* @brief Find the next *indirect* object in an object stream, adds it to our list of
* objects, and increments nobj.
*
* Indirect objects in a stream DON'T begin with "obj" and end with "endobj".
* Instead, they have an objid and an offset from the first object to point you
* right at them.
*
* If found, objstm->current will be updated to the next objid.
*
* All objects in an object stream are indirect and thus do not begin or start
* with "obj" or "endobj". Instead, the object stream takes the following
* format.
*
* <dictionary describing stream> objstm content endobjstm
*
* where content looks something like the following:
*
* 15 0 16 3 17 46 (ab)<</IDS 8 0 R/JavaScript 27 0 R/URLS 9 0 R>><</Names[(Test)28 0 R]>>
*
* In the above example, the literal string (ab) is indirect object # 15, and
* begins at offset 0 of the set of objects. The next object, # 16 begis at
* offset 3 is a dictionary. The final object is also a dictionary, beginning
* at offset 46.
*
* @param pdf Pdf struct that keeps track of all information found in the PDF.
* @param objstm
*
* @return CL_SUCCESS if success
* @return CL_EPARSE if parsing error
* @return CL_EMEM if error allocating memory
* @return CL_EARG if invalid arguments
*/
int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm, struct pdf_obj **obj_found)
{
cl_error_t status = CL_EPARSE;
struct pdf_obj *obj = NULL;
unsigned long objid = 0, objoff = 0;
long temp_long = 0;
const char *index = NULL;
size_t bytes_remaining = 0;
if (NULL == pdf || NULL == objstm) {
cli_warnmsg("pdf_findobj_in_objstm: invalid arguments\n");
return CL_EARG;
}
if (pdf->nobjs >= MAX_PDF_OBJECTS) {
pdf->flags |= 1 << BAD_PDF_TOOMANYOBJS;
cli_dbgmsg("pdf_findobj_in_objstm: reached object maximum\n");
status = CL_BREAK;
goto done;
}
*obj_found = NULL;
index = objstm->streambuf + objstm->current_pair;
bytes_remaining = objstm->streambuf_len - objstm->current_pair;
obj = calloc(sizeof(struct pdf_obj), 1);
if (!obj) {
cli_warnmsg("pdf_findobj_in_objstm: out of memory finding objects in stream\n");
status = CL_EMEM;
goto done;
}
/* This object is in a stream, not in the regular map buffer. */
obj->objstm = objstm;
/* objstm->current_pair points directly to the objid */
if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
/* Failed to find objid */
cli_dbgmsg("pdf_findobj_in_objstm: Failed to find objid for obj in object stream\n");
status = CL_EPARSE;
goto done;
} else if (temp_long < 0) {
cli_dbgmsg("pdf_findobj_in_objstm: Encountered invalid negative objid (%ld).\n", temp_long);
status = CL_EPARSE;
goto done;
}
objid = (unsigned long)temp_long;
/* Find the obj offset that appears just after the objid*/
while ((index < objstm->streambuf + objstm->streambuf_len) && isdigit(*index)) {
index++;
bytes_remaining--;
}
index = findNextNonWS(index, objstm->streambuf + objstm->first);
bytes_remaining = objstm->streambuf + objstm->streambuf_len - index;
if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
/* Failed to find obj offset */
cli_dbgmsg("pdf_findobj_in_objstm: Failed to find obj offset for obj in object stream\n");
status = CL_EPARSE;
goto done;
} else if (temp_long < 0) {
cli_dbgmsg("pdf_findobj_in_objstm: Encountered invalid negative obj offset (%ld).\n", temp_long);
status = CL_EPARSE;
goto done;
}
objoff = (unsigned long)temp_long;
if ((size_t)objstm->first + (size_t)objoff > objstm->streambuf_len) {
/* Alleged obj location is further than the length of the stream */
cli_dbgmsg("pdf_findobj_in_objstm: obj offset found is greater than the length of the stream.\n");
status = CL_EPARSE;
goto done;
}
objstm->current = objstm->first + objoff;
obj->id = (objid << 8) | (0 & 0xff);
obj->start = objstm->current;
obj->flags = 0;
objstm->nobjs_found++;
while ((index < objstm->streambuf + objstm->streambuf_len) && isdigit(*index)) {
index++;
bytes_remaining--;
}
objstm->current_pair = (uint32_t)(findNextNonWS(index, objstm->streambuf + objstm->first) - objstm->streambuf);
/* Update current_pair, if there are more */
if ((objstm->nobjs_found < objstm->n) &&
(index < objstm->streambuf + objstm->streambuf_len)) {
unsigned long next_objoff = 0;
/*
* While we're at it,
* lets record the size as running up to the next object offset.
*
* To do so, we will need to parse the next obj pair.
*/
/* objstm->current_pair points directly to the objid */
index = objstm->streambuf + objstm->current_pair;
bytes_remaining = objstm->streambuf + objstm->streambuf_len - index;
/* We don't actually care about the object id at this point, so reading the object id is commented out.
I didn't delete it entirely in case the object id is needed in the future. */
// if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
// /* Failed to find objid for next obj */
// cli_dbgmsg("pdf_findobj_in_objstm: Failed to find next objid for obj in object stream though there should be {%u} more.\n", objstm->n - objstm->nobjs_found);
// status = CL_EPARSE;
// goto done;
// } else if (temp_long < 0) {
// cli_dbgmsg("pdf_findobj_in_objstm: Encountered invalid negative objid (%ld).\n", temp_long);
// status = CL_EPARSE;
// goto done;
// }
// next_objid = (unsigned long)temp_long;
/* Find the obj offset that appears just after the objid*/
while ((index < objstm->streambuf + objstm->streambuf_len) && isdigit(*index)) {
index++;
bytes_remaining--;
}
index = findNextNonWS(index, objstm->streambuf + objstm->first);
bytes_remaining = objstm->streambuf + objstm->streambuf_len - index;
if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
/* Failed to find obj offset for next obj */
cli_dbgmsg("pdf_findobj_in_objstm: Failed to find next obj offset for obj in object stream though there should be {%zu} more.\n", objstm->n - objstm->nobjs_found);
status = CL_EPARSE;
goto done;
} else if (temp_long < 0) {
cli_dbgmsg("pdf_findobj_in_objstm: Encountered invalid negative obj offset (%ld).\n", temp_long);
status = CL_EPARSE;
goto done;
}
next_objoff = (unsigned long)temp_long;
if (next_objoff <= objoff) {
/* Failed to find obj offset for next obj */
cli_dbgmsg("pdf_findobj_in_objstm: Found next obj offset for obj in object stream but it's less than or equal to the current one!\n");
status = CL_EPARSE;
goto done;
} else if (objstm->first + next_objoff > objstm->streambuf_len) {
/* Failed to find obj offset for next obj */
cli_dbgmsg("pdf_findobj_in_objstm: Found next obj offset for obj in object stream but it's further out than the size of the stream!\n");
status = CL_EPARSE;
goto done;
}
obj->size = next_objoff - objoff;
} else {
/*
* Should be no more objects. We should verify.
*
* Either way...
* obj->size should be the rest of the buffer.
*/
if (objstm->nobjs_found < objstm->n) {
cli_warnmsg("pdf_findobj_in_objstm: Fewer objects found in object stream than expected!\n");
}
obj->size = objstm->streambuf_len - obj->start;
}
/* Success! Add the object to the list of all objects found. */
pdf->nobjs++;
CLI_MAX_REALLOC_OR_GOTO_DONE(pdf->objs, sizeof(struct pdf_obj *) * pdf->nobjs,
cli_warnmsg("pdf_findobj_in_objstm: out of memory finding objects in stream\n"),
status = CL_EMEM);
pdf->objs[pdf->nobjs - 1] = obj;
*obj_found = obj;
status = CL_SUCCESS;
done:
if (CL_SUCCESS != status) {
if (NULL != obj) {
free(obj);
}
}
return status;
}
/**
* @brief Find the next *indirect* object.
*
* Indirect objects located outside of an object stream are prefaced with:
* <objid> <genid> obj
*
* Each of the above are separated by whitespace of some sort.
*
* Indirect objects are postfaced with:
* endobj
*
* The specification does not say if whitespace is required before or after "endobj".
*
* Identify truncated objects.
*
* If found, pdf->offset will be updated to just after the "endobj".
* If truncated, pdf->offset will == pdf->size.
* If not found, pdf->offset will not be updated.
*
* @param pdf Pdf context struct that keeps track of all information found in the PDF.
*
* @return CL_SUCCESS if success
* @return CL_BREAK if no more objects
* @return CL_EPARSE if parsing error
* @return CL_EMEM if error allocating memory
*/
cl_error_t pdf_findobj(struct pdf_struct *pdf)
{
    cl_error_t status = CL_EPARSE;
    const char *start, *idx, *genid_search_index, *objid_search_index;
    const char *obj_begin = NULL, *obj_end = NULL;
    const char *endobj_begin = NULL, *endobj_end = NULL;
    struct pdf_obj *obj = NULL;
    size_t bytesleft;
    unsigned long genid, objid;
    long temp_long;
    /* Non-zero once a slot for the new object has been appended to pdf->objs.
     * The cleanup code must only take back a slot that was really added. */
    int slot_reserved = 0;

    if (pdf->nobjs >= MAX_PDF_OBJECTS) {
        pdf->flags |= 1 << BAD_PDF_TOOMANYOBJS;

        cli_dbgmsg("pdf_findobj: reached object maximum\n");
        status = CL_BREAK;
        goto done;
    }

    /* Grow the list by one slot. The count is bumped only after the
     * reallocation succeeded, so a failure leaves the list untouched. */
    CLI_MAX_REALLOC_OR_GOTO_DONE(pdf->objs, sizeof(struct pdf_obj *) * (pdf->nobjs + 1), status = CL_EMEM);
    pdf->objs[pdf->nobjs] = NULL;
    pdf->nobjs++;
    slot_reserved = 1;

    obj = calloc(1, sizeof(*obj));
    if (!obj) {
        status = CL_EMEM;
        goto done;
    }
    pdf->objs[pdf->nobjs - 1] = obj;

    start     = pdf->map + pdf->offset;
    bytesleft = pdf->size - pdf->offset;

    /*
     * Start by searching for "obj"
     */
    idx = start + 1;
    while (bytesleft > 1 + strlen("obj")) {
        /* `- 1` accounts for size of white space before obj */
        idx = cli_memstr(idx, bytesleft - 1, "obj", strlen("obj"));
        if (NULL == idx) {
            status = CL_BREAK;
            goto done; /* No more objs. */
        }

        /* verify that the word has a whitespace before it, and is not the end of
         * a previous word */
        idx--;
        bytesleft = (pdf->size - pdf->offset) - (size_t)(idx - start);

        if (*idx != 0 && *idx != 9 && *idx != 0xa && *idx != 0xc && *idx != 0xd && *idx != 0x20) {
            /* This instance of "obj" appears to be part of a longer string.
             * Skip it, and keep searching for an object. */
            idx += 1 + strlen("obj");
            bytesleft -= 1 + strlen("obj");
            continue;
        }

        /* Found the beginning of the word */
        obj_begin = idx;
        obj_end   = idx + 1 + strlen("obj");

        break;
    }

    if ((NULL == obj_begin) || (NULL == obj_end)) {
        status = CL_BREAK;
        goto done; /* No more objs. */
    }

    if (obj_begin <= start) {
        /* "obj" sits right at the start of the search window, so there is no
         * room for a genid in front of it. Bail out instead of reading the
         * byte before `start`. */
        cli_dbgmsg("pdf_findobj: Failed to parse object genid (# objects found: %u)\n", pdf->nobjs);
        /* Probably not a real object. Skip past the "obj" thing, and continue. */
        pdf->offset = obj_end - pdf->map;
        status      = CL_EPARSE;
        goto done;
    }

    /* Find the generation id (genid) that appears before the "obj".
     * The casts keep isdigit() defined for bytes >= 0x80 where char is signed. */
    genid_search_index = findNextNonWSBack(obj_begin - 1, start);
    while (genid_search_index > start && isdigit((unsigned char)*genid_search_index))
        genid_search_index--;

    if (CL_SUCCESS != cli_strntol_wrap(genid_search_index, (size_t)((obj_begin)-genid_search_index), 0, 10, &temp_long)) {
        cli_dbgmsg("pdf_findobj: Failed to parse object genid (# objects found: %u)\n", pdf->nobjs);
        /* Failed to parse, probably not a real object. Skip past the "obj" thing, and continue. */
        pdf->offset = obj_end - pdf->map;
        status      = CL_EPARSE;
        goto done;
    } else if (temp_long < 0) {
        cli_dbgmsg("pdf_findobj: Encountered invalid negative obj genid (%ld).\n", temp_long);
        pdf->offset = obj_end - pdf->map;
        status      = CL_EPARSE;
        goto done;
    }
    genid = (unsigned long)temp_long;

    if (genid_search_index <= start) {
        /* The genid begins at the very start of the search window, so no
         * objid can precede it. Bail out instead of reading before `start`. */
        cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
        /* Probably not a real object. Skip past the "obj" thing, and continue. */
        pdf->offset = obj_end - pdf->map;
        status      = CL_EPARSE;
        goto done;
    }

    /* Find the object id (objid) that appears before the genid */
    objid_search_index = findNextNonWSBack(genid_search_index - 1, start);
    while (objid_search_index > start && isdigit((unsigned char)*objid_search_index))
        objid_search_index--;

    if (CL_SUCCESS != cli_strntol_wrap(objid_search_index, (size_t)((genid_search_index)-objid_search_index), 0, 10, &temp_long)) {
        /*
         * Edge case:
         *
         * PDFs with multiple revisions will have %%EOF before the end of the file,
         * followed by the next revision of the PDF, which will probably be an immediate objid.
         *
         * Example:
         *   %%EOF1 1 obj <blah> endobj
         *
         * If this is the case, we can detect it and continue parsing after the %%EOF.
         * The backwards scan stopped on the 'F', i.e. 4 bytes after the first '%'.
         */
        if ((size_t)(objid_search_index - start) > strlen("%%EO")) {
            const char *lastfile = objid_search_index - strlen("%%EO");
            if (0 != strncmp(lastfile, "%%EOF", 5)) {
                /* Nope, wasn't %%EOF */
                cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
                /* Skip past the "obj" thing, and continue. */
                pdf->offset = obj_end - pdf->map;
                status      = CL_EPARSE;
                goto done;
            }
            /* Yup, Looks, like the file continues after %%EOF.
             * Probably another revision.  Keep parsing... */
            objid_search_index++;
            cli_dbgmsg("pdf_findobj: %%%%EOF detected before end of file, at offset: %zu\n", (size_t)(objid_search_index - pdf->map));
        } else {
            /* Failed parsing at the very beginning */
            cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
            /* Probably not a real object. Skip past the "obj" thing, and continue. */
            pdf->offset = obj_end - pdf->map;
            status      = CL_EPARSE;
            goto done;
        }

        /* Try again, starting just after the 'F'. The objid occupies every
         * byte from there up to (not including) genid_search_index, so the
         * whole span is handed to the parser. */
        if (CL_SUCCESS != cli_strntol_wrap(objid_search_index, (size_t)(genid_search_index - objid_search_index), 0, 10, &temp_long)) {
            cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
            /* Still failed... Probably not a real object. Skip past the "obj" thing, and continue. */
            pdf->offset = obj_end - pdf->map;
            status      = CL_EPARSE;
            goto done;
        } else if (temp_long < 0) {
            cli_dbgmsg("pdf_findobj: Encountered invalid negative objid (%ld).\n", temp_long);
            pdf->offset = obj_end - pdf->map;
            status      = CL_EPARSE;
            goto done;
        }

        cli_dbgmsg("pdf_findobj: There appears to be an additional revision. Continuing to parse...\n");
    } else if (temp_long < 0) {
        cli_dbgmsg("pdf_findobj: Encountered invalid negative objid (%ld).\n", temp_long);
        pdf->offset = obj_end - pdf->map;
        status      = CL_EPARSE;
        goto done;
    }
    objid = (unsigned long)temp_long;

    /* The id packs the objid above the low 8 bits of the genid. */
    obj->id    = (objid << 8) | (genid & 0xff);
    obj->start = obj_end - pdf->map; /* obj start begins just after the "obj" string */
    obj->flags = 0;

    /*
     * We now have the objid, genid, and object start.
     * Find the object end ("endobj"), searching from just after "obj" to the
     * end of the map.
     */
    endobj_begin = cli_memstr(obj_end, pdf->map + pdf->size - obj_end, "endobj", strlen("endobj"));
    if (NULL == endobj_begin) {
        /* No end to object.
         * PDF appears to be malformed or truncated.
         * Will record the object size as going to the end of the file.
         * Will record that the object is truncated.
         * Will position the pdf offset to the end of the PDF.
         * The next iteration of this function will find no more objects. */
        obj->flags |= 1 << OBJ_TRUNCATED;
        obj->size   = (pdf->map + pdf->size) - obj_end;
        pdf->offset = pdf->size;

        /* Truncated "object" found! */
        status = CL_SUCCESS;
        goto done;
    }

    endobj_end = endobj_begin + strlen("endobj");

    /* Size of the object goes from "obj" <-> "endobj". */
    obj->size   = endobj_begin - obj_end;
    pdf->offset = endobj_end - pdf->map;

    /*
     * Object found!
     */
    status = CL_SUCCESS;

done:
    if (status == CL_SUCCESS) {
        cli_dbgmsg("pdf_findobj: found %d %d obj @%lld, size: %zu bytes.\n", obj->id >> 8, obj->id & 0xff, (long long)(obj->start + pdf->startoff), obj->size);
    } else {
        if (slot_reserved) {
            /* Remove the unused obj reference from our list of objects found */
            /* No need to realloc pdf->objs back down. It won't leak. */
            pdf->objs[pdf->nobjs - 1] = NULL;
            pdf->nobjs--;
        }

        /* Free up the obj struct (free(NULL) is a no-op). */
        free(obj);

        if (status == CL_BREAK) {
            cli_dbgmsg("pdf_findobj: No more objects (# objects found: %u)\n", pdf->nobjs);
        } else if (status == CL_EMEM) {
            cli_warnmsg("pdf_findobj: Error allocating memory (# objects found: %u)\n", pdf->nobjs);
        } else {
            cli_dbgmsg("pdf_findobj: Unexpected status code %d.\n", status);
        }
    }

    return status;
}
/*
 * Write `len` bytes from `buf` to the descriptor `fout`, adding `len` to the
 * running total in `*sum`.
 *
 * The current total is first handed to cli_checklimits(). If that call
 * returns non-zero, nothing is written, `*sum` stays unchanged and `len` is
 * returned, so the caller sees the data as consumed. Otherwise the result of
 * cli_writen() is returned.
 *
 * `obj` is not used.
 */
static size_t filter_writen(struct pdf_struct *pdf, struct pdf_obj *obj, int fout, const char *buf, size_t len, size_t *sum)
{
    const uint64_t total_so_far = (uint64_t)*sum;

    UNUSEDPARAM(obj);

    if (0 != cli_checklimits("pdf", pdf->ctx, total_so_far, 0, 0)) {
        /* Limit check did not pass: skip the write but report every byte as handled. */
        return len;
    }

    *sum = *sum + len;

    return cli_writen(fout, buf, len);
}
void pdfobj_flag(struct pdf_struct *pdf, struct pdf_obj *obj, enum pdf_flag flag)
{
const char *s = "";
pdf->flags |= 1 << flag;
if (!cli_debug_flag)
return;
switch (flag) {
case UNTERMINATED_OBJ_DICT:
s = "dictionary not terminated";
break;
case ESCAPED_COMMON_PDFNAME:
/* like /JavaScript */
s = "escaped common pdfname";
break;
case BAD_STREAM_FILTERS:
s = "duplicate stream filters";
break;
case BAD_PDF_VERSION:
s = "bad pdf version";
break;
case BAD_PDF_HEADERPOS:
s = "bad pdf header position";
break;
case BAD_PDF_TRAILER:
s = "bad pdf trailer";
break;
case BAD_PDF_TOOMANYOBJS:
s = "too many pdf objs";
break;
case BAD_FLATE:
s = "bad deflate stream";
break;
case BAD_FLATESTART:
s = "bad deflate stream start";
break;
case BAD_STREAMSTART:
s = "bad stream start";
break;
case UNKNOWN_FILTER:
s = "unknown filter used";
break;
case BAD_ASCIIDECODE:
s = "bad ASCII decode";
break;
case HEX_JAVASCRIPT:
s = "hex javascript";
break;
case BAD_INDOBJ:
s = "referencing nonexistent obj";
break;
case HAS_OPENACTION:
s = "has /OpenAction";
break;
case HAS_LAUNCHACTION:
s = "has /LaunchAction";
break;
case BAD_STREAMLEN:
s = "bad /Length, too small";
break;
case ENCRYPTED_PDF:
s = "PDF is encrypted";
break;
case LINEARIZED_PDF:
s = "linearized PDF";
break;
case MANY_FILTERS:
s = "more than 2 filters per obj";
break;
case DECRYPTABLE_PDF:
s = "decryptable PDF";
break;
}
cli_dbgmsg("pdfobj_flag: %s flagged in object %u %u\n", s, obj->id >> 8, obj->id & 0xff);
}
/*
 * Look up the object whose id equals `objid` in pdf->objs.
 *
 * The search begins at the position of `obj` in the list (if `obj` is not in
 * the list, at the end), runs to the end of the list, and then wraps around
 * to cover the entries before that position.
 *
 * Returns the matching object, or NULL when no object has that id.
 */
struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t objid)
{
    struct pdf_obj *candidate;
    uint32_t pivot = 0;
    uint32_t pos;

    /* search starting at previous obj (if exists) */
    while (pivot < pdf->nobjs && pdf->objs[pivot] != obj)
        pivot++;

    for (pos = pivot; pos < pdf->nobjs; pos++) {
        candidate = pdf->objs[pos];
        if (objid == candidate->id)
            return candidate;
    }

    /* restart search from beginning if not found */
    for (pos = 0; pos < pivot; pos++) {
        candidate = pdf->objs[pos];
        if (objid == candidate->id)
            return candidate;
    }

    return NULL;
}
/**
* @brief Find and interpret the "/Length" dictionary key value.
*
* The value may be:
* - a direct object (i.e. just a number)
* - an indirect object, where the value is somewhere else in the document and we have to look it up.
* Indirect objects are referenced using an object id (objid), a generation id (genid), and the letter 'R'.
*
* Example dictionary whose "/Length" key relies on a direct object for the value.
*
* 1 0 obj
* << /Length 534
* /Filter [ /ASCII85Decode /LZWDecode ]
* >>
* stream
* J..)6T`?p&<!J9%_[umg"B7/Z7KNXbN'S+,*Q/&"OLT'FLIDK#!n`$"<Atdi`\Vn%b%)&'cA*VnK\CJY(sF>c!Jnl@
* RM]WM;jjH6Gnc75idkL5]+cPZKEBPWdR>FF(kj1_R%W_d&/jS!;iuad7h?[L-F$+]]0A3Ck*$I0KZ?;<)CJtqi65Xb
* Vc3\n5ua:Q/=0$W<#N3U;H,MQKqfg1?:lUpR;6oN[C2E4ZNr8Udn.'p+?#X+1>0Kuk$bCDF/(3fL5]Oq)^kJZ!C2H1
* 'TO]Rl?Q:&'<5&iP!$Rq;BXRecDN[IJB`,)o8XJOSJ9sDS]hQ;Rj@!ND)bD_q&C\g:inYC%)&u#:u,M6Bm%IY!Kb1+
* ":aAa'S`ViJglLb8<W9k6Yl\\0McJQkDeLWdPN?9A'jX*al>iG1p&i;eVoK&juJHs9%;Xomop"5KatWRT"JQ#qYuL,
* JD?M$0QP)lKn06l1apKDC@\qJ4B!!(5m+j.7F790m(Vj88l8Q:_CZ(Gm1%X\N1&u!FKHMB~>
* endstream
* endobj
*
* Example dictionary with a single key "/Length" that relies on an indirect object for the value.
*
* 7 0 obj
* << /Length 8 0 R >> % An indirect reference to object 8, with generation id 0.
* stream
* BT
* /F1 12 Tf
* 72 712 Td
* ( A stream with an indirect length ) Tj
* ET
* endstream
* endobj
*
* 8 0 obj
* 77 % The length of the preceding stream
* endobj
*
* @param pdf Pdf context structure.
* @param obj Pdf object context structure.
* @param dict_start Pointer to the start of the dictionary string.
* @param dict_len Remaining length of the dictionary string in bytes.
* @return size_t Unsigned integer value of the "/Length" key, or 0 if it could not be determined.
*/
static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const char *dict_start, size_t dict_len)
{
    size_t length          = 0;
    const char *obj_start  = dict_start;
    size_t bytes_remaining = dict_len;
    long temp_long         = 0;
    const char *index;
    size_t value_offset;
    size_t max_length;

    /* Too short to hold "/Length" plus a value. */
    if (bytes_remaining < 8) {
        return 0;
    }

    /*
     * Find the "/Length" dictionary key
     */
    index = cli_memstr(obj_start, bytes_remaining, "/Length", 7);
    if (!index)
        return 0;

    bytes_remaining -= index - obj_start;

    if (bytes_remaining < 1) {
        return 0;
    }

    /* Step the index into the "/Length" string. */
    index++;
    bytes_remaining--;

    /* Find the start of the next direct or indirect object.
     * pdf_nextobject() assumes we started searching from within a previous object */
    obj_start = pdf_nextobject(index, bytes_remaining);
    if (!obj_start)
        return 0;

    if (bytes_remaining < (size_t)(obj_start - index)) {
        return 0;
    }
    bytes_remaining -= obj_start - index;

    index = obj_start;

    /* Read the value. This could either be the direct length value,
       or the object id of the indirect object that has the length */
    if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
        cli_dbgmsg("find_length: failed to parse object length or objid\n");
        return 0;
    } else if (temp_long < 0) {
        cli_dbgmsg("find_length: Encountered invalid negative object length or objid (%ld).\n", temp_long);
        return 0;
    }

    length = (size_t)temp_long; /* length or maybe object id */

    /*
     * Keep parsing, skipping past the first integer that might have been what we wanted.
     * If it's an indirect object, we'll find a Generation ID followed by the letter 'R'
     * I.e. something like " 0 R"
     *
     * The casts keep isdigit() defined for bytes >= 0x80 where char is signed.
     */
    while ((bytes_remaining > 0) && isdigit((unsigned char)*index)) {
        index++;
        bytes_remaining--;
    }

    if ((bytes_remaining > 0) && (*index == ' ')) {
        index++;
        bytes_remaining--;

        if ((CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) || (temp_long < 0)) {
            /* Whatever follows the first integer is not a generation id, e.g.
             * "/Length 534 /Filter ...". The first integer is then the direct
             * length, so keep it rather than giving up. */
            cli_dbgmsg("find_length: no generation id after the first integer, using it as the direct length\n");
        } else {
            unsigned long genid = (unsigned long)temp_long;

            while ((bytes_remaining > 0) && isdigit((unsigned char)*index)) {
                index++;
                bytes_remaining--;
            }

            /* Only " R" after the second integer makes this an indirect
             * reference; anything else leaves the direct length in place. */
            if ((bytes_remaining >= 2) && (index[0] == ' ') && (index[1] == 'R')) {
                /*
                 * Ok so we found a genid and that 'R'.  Which means that first value
                 * was actually the objid.
                 * We can look up the indirect object using this information.
                 */
                unsigned long objid            = length;
                const char *indirect_obj_start = NULL;

                cli_dbgmsg("find_length: length is in indirect object %lu %lu\n", objid, genid);

                obj = find_obj(pdf, obj, (length << 8) | (genid & 0xff));
                if (!obj) {
                    cli_dbgmsg("find_length: indirect object not found\n");
                    return 0;
                }

                if (NULL == obj->objstm) {
                    /* The referenced object lives in the main file map. */
                    indirect_obj_start = (const char *)(obj->start + pdf->map);

                    if (!CLI_ISCONTAINED(pdf->map, pdf->size, indirect_obj_start, obj->size)) {
                        cli_dbgmsg("find_length: indirect object found, but not contained in PDF\n");
                        return 0;
                    }

                    bytes_remaining = pdf->size - obj->start;

                } else {
                    /* The referenced object lives in an object stream buffer. */
                    indirect_obj_start = (const char *)(obj->start + obj->objstm->streambuf);

                    if (!CLI_ISCONTAINED(obj->objstm->streambuf, obj->objstm->streambuf_len, indirect_obj_start, obj->size)) {
                        cli_dbgmsg("find_length: indirect object found, but not contained in PDF streambuf\n");
                        return 0;
                    }

                    bytes_remaining = obj->objstm->streambuf_len - obj->start;
                }

                /* Ok so we found the indirect object, lets read the value. */
                index = pdf_nextobject(indirect_obj_start, bytes_remaining);
                if (!index) {
                    cli_dbgmsg("find_length: next object not found\n");
                    return 0;
                }

                if (bytes_remaining < (size_t)(index - indirect_obj_start)) {
                    return 0;
                }
                bytes_remaining -= index - indirect_obj_start;

                /* Found the value, so lets parse it as a long, but prohibit negative lengths. */
                if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
                    cli_dbgmsg("find_length: failed to parse object length from indirect object\n");
                    return 0;
                } else if (temp_long < 0) {
                    cli_dbgmsg("find_length: Encountered invalid negative obj length (%ld).\n", temp_long);
                    return 0;
                }

                length = (size_t)temp_long;
            }
        }
    }

    /* limit length: the stream cannot extend past the end of the file map.
     * The bound is computed with subtractions that cannot wrap, so a value
     * located within the last 5 bytes of the map yields 0 instead of a huge
     * length.
     * NOTE(review): this assumes dict_start points into pdf->map — confirm
     * against the callers. */
    value_offset = (size_t)(obj_start - pdf->map);
    if ((value_offset < pdf->size) && (pdf->size - value_offset > 5)) {
        max_length = pdf->size - value_offset - 5;
    } else {
        max_length = 0;
    }
    if (length > max_length) {
        length = max_length;
    }

    return length;
}
/* Object flags that keep pdf_extract_obj() from skipping an object: an object
 * that is not a stream, or that has filters, is only dumped when at least one
 * of these flags is set (or when OBJ_FORCEDUMP is set). */
#define DUMP_MASK ((1 << OBJ_CONTENTS) | (1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_DCT) | (1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_A85) | (1 << OBJ_EMBEDDED_FILE) | (1 << OBJ_JAVASCRIPT) | (1 << OBJ_OPENACTION) | (1 << OBJ_LAUNCHACTION))
/**
 * @brief Run the BC_PDF bytecode hooks for one phase of PDF processing.
 *
 * @param pdf       PDF parsing state; must not be NULL.
 * @param phase     Phase reported to the bytecode context.
 * @param fd        Descriptor of an extracted object to map for the hooks,
 *                  or -1 to run the hooks against the scan context's fmap.
 * @param filepath  Path associated with fd (passed through to fmap_new()).
 * @return CL_EARG if pdf is NULL, CL_EMEM if the bytecode context cannot be
 *         allocated, otherwise whatever cli_bytecode_runhook() returns.
 */
static int run_pdf_hooks(struct pdf_struct *pdf, enum pdf_phase phase, int fd, const char *filepath)
{
    cli_ctx *ctx;
    struct cli_bc_ctx *bc_ctx;
    fmap_t *hook_map;
    bool owns_map = false;
    int ret;

    if (NULL == pdf) {
        return CL_EARG;
    }
    ctx = pdf->ctx;

    bc_ctx = cli_bytecode_context_alloc();
    if (NULL == bc_ctx) {
        cli_errmsg("run_pdf_hooks: can't allocate memory for bc_ctx\n");
        return CL_EMEM;
    }

    /* Default to the map of the file being scanned. */
    hook_map = ctx->fmap;

    if (-1 != fd) {
        /* The fmap in this bytecode context is an extracted pdf object. */
        fmap_t *extracted_map = fmap_new(fd, 0, 0, NULL, filepath);

        if (NULL != extracted_map) {
            hook_map = extracted_map;
            owns_map = true;
        } else {
            /* Fall back to the scan context's map; nothing to free later. */
            cli_dbgmsg("run_pdf_hooks: can't mmap pdf extracted obj\n");
        }
    }

    cli_bytecode_context_setpdf(bc_ctx, phase, pdf->nobjs, pdf->objs, &pdf->flags, pdf->size, pdf->startoff);
    cli_bytecode_context_setctx(bc_ctx, ctx);
    ret = cli_bytecode_runhook(ctx, ctx->engine, bc_ctx, BC_PDF, hook_map);
    cli_bytecode_context_destroy(bc_ctx);

    if (owns_map) {
        fmap_free(hook_map);
    }

    return ret;
}
static void dbg_printhex(const char *msg, const char *hex, unsigned len);
/**
 * @brief Decrypt an AES-CBC buffer (16, 24 or 32 byte key) into q.
 *
 * When has_iv is non-zero, the first 16 bytes of `in` are consumed as the IV
 * and the trailing padding of the plaintext is validated and stripped.
 *
 * @param in              Input buffer (IV + ciphertext when has_iv is set).
 * @param[in,out] length  In: number of bytes at `in`. Out: number of plaintext
 *                        bytes written to q. It is left unchanged when the
 *                        function bails out before decrypting (NULL arguments,
 *                        bad key length, fewer than 32 input bytes, or key
 *                        setup failure).
 * @param q               Output buffer, at least *length bytes.
 * @param key             Key bytes.
 * @param key_n           Key length in bytes; must be 16, 24 or 32.
 * @param has_iv          Non-zero if `in` starts with a 16-byte IV.
 */
static void aes_256cbc_decrypt(const unsigned char *in, size_t *length, unsigned char *q, char *key, unsigned key_n, int has_iv)
{
    uint32_t rk[RKLENGTH(256)];
    unsigned char iv[16];
    size_t len = 0;
    unsigned char pad;
    unsigned i;
    int nrounds;

    if (in == NULL || length == NULL || q == NULL) {
        cli_dbgmsg("aes_256cbc_decrypt: invalid NULL parameters!\n");
        noisy_warnmsg("aes_256cbc_decrypt: invalid NULL parameters!\n");
        return;
    }

    len = *length;

    cli_dbgmsg("aes_256cbc_decrypt: key length: %d, data length: %zu\n", key_n, *length);
    if (!(key_n == 16 || key_n == 24 || key_n == 32)) {
        cli_dbgmsg("aes_256cbc_decrypt: invalid key length: %u!\n", key_n * 8);
        noisy_warnmsg("aes_256cbc_decrypt: invalid key length: %u!\n", key_n * 8);
        return;
    }

    if (len < 32) {
        cli_dbgmsg("aes_256cbc_decrypt: len is <32: %zu\n", len);
        noisy_warnmsg("aes_256cbc_decrypt: len is <32: %zu\n", len);
        return;
    }

    if (has_iv) {
        memcpy(iv, in, 16);
        in += 16;
        len -= 16;
    } else {
        memset(iv, 0, sizeof(iv));
    }

    cli_dbgmsg("aes_256cbc_decrypt: Calling rijndaelSetupDecrypt\n");
    nrounds = rijndaelSetupDecrypt(rk, (const unsigned char *)key, key_n * 8);
    if (!nrounds) {
        cli_dbgmsg("aes_256cbc_decrypt: nrounds = 0\n");
        return;
    }
    cli_dbgmsg("aes_256cbc_decrypt: Beginning rijndaelDecrypt\n");

    /* CBC: plaintext block = D(ciphertext block) XOR previous ciphertext block.
     * len >= 16 is guaranteed here, so at least one block is written. */
    while (len >= 16) {
        rijndaelDecrypt(rk, nrounds, in, q);
        for (i = 0; i < 16; i++)
            q[i] ^= iv[i];

        memcpy(iv, in, 16);

        q += 16;
        in += 16;
        len -= 16;
    }

    if (has_iv) {
        /* `len` now holds the undecrypted tail; add back the IV so that
         * `*length -= len` yields the number of plaintext bytes. */
        len += 16;
        pad = q[-1];

        if (pad > 0x10) {
            cli_dbgmsg("aes_256cbc_decrypt: bad pad: %x (extra len: %zu)\n", pad, len - 16);
            noisy_warnmsg("aes_256cbc_decrypt: bad pad: %x (extra len: %zu)\n", pad, len - 16);
            *length -= len;
            return;
        }

        /* Every one of the `pad` trailing bytes must equal `pad`.
         * Start at index 0 so the first padding byte is verified too. */
        q -= pad;
        for (i = 0; i < pad; i++) {
            if (q[i] != pad) {
                cli_dbgmsg("aes_256cbc_decrypt: bad pad: %x != %x\n", q[i], pad);
                noisy_warnmsg("aes_256cbc_decrypt: bad pad: %x != %x\n", q[i], pad);
                *length -= len;

                return;
            }
        }

        len += pad;
    }

    *length -= len;

    cli_dbgmsg("aes_256cbc_decrypt: length is %zu\n", *length);
}
/**
 * @brief AES-128 CBC encrypt whole 16-byte blocks of `in` into `out`.
 *
 * No padding is added; any trailing partial block of the input is ignored.
 * `in` and `out` may point to the same buffer: each block is fully consumed
 * before the corresponding output block is written.
 *
 * @param in               Plaintext.
 * @param in_length        Number of plaintext bytes; must be at least 16.
 * @param out              Output buffer, at least in_length bytes.
 * @param[out] out_length  Number of bytes written to `out`. Set to 0 as soon
 *                         as the arguments are known to be usable, so every
 *                         failure path reports a defined length.
 * @param key              Key bytes.
 * @param key_n            Key length in bytes; must not exceed 16.
 * @param iv               16-byte IV, or NULL for an all-zero IV.
 */
static void aes_128cbc_encrypt(const unsigned char *in, size_t in_length, unsigned char *out, size_t *out_length, const unsigned char *key, size_t key_n, const unsigned char *iv)
{
    uint32_t rk[RKLENGTH(128)];
    unsigned char real_iv[16] = {0};
    int nrounds;
    size_t i;

    if (NULL == out_length) {
        cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: invalid NULL out_length\n");
        return;
    }

    /* Callers read *out_length after the call; never leave it indeterminate. */
    *out_length = 0;

    if (NULL == in || NULL == out || NULL == key) {
        cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: invalid NULL parameters\n");
        return;
    }

    cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: key length: %zu, data length: %zu\n", key_n, in_length);
    if (key_n > 16) {
        cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: key length is %zu!\n", key_n * 8);
        return;
    }

    if (in_length < 16) {
        cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: in_length is <16: %zu\n", in_length);
        noisy_warnmsg("cli_pdf: aes_128cbc_encrypt: in_length is <16: %zu\n", in_length);
        return;
    }

    cli_dbgmsg("aes_128cbc_encrypt: Calling rijndaelSetupEncrypt\n");
    nrounds = rijndaelSetupEncrypt(rk, key, key_n * 8);
    if (!nrounds) {
        cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: nrounds = 0\n");
        return;
    }
    cli_dbgmsg("aes_128cbc_encrypt: Beginning rijndaelEncrypt\n");

    if (iv)
        memcpy(real_iv, iv, sizeof(real_iv));

    /* CBC: ciphertext block = E(plaintext block XOR previous ciphertext block).
     * real_iv doubles as the chaining value. */
    while (in_length >= 16) {
        for (i = 0; i < 16; i++)
            real_iv[i] ^= in[i];

        rijndaelEncrypt(rk, nrounds, real_iv, real_iv);

        memcpy(out, real_iv, sizeof(real_iv));

        out += 16;
        *out_length += 16;
        in += 16;
        in_length -= 16;
    }

    cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: length is %zu\n", *out_length);
}
/**
 * @brief Decrypt a string or stream belonging to a PDF object.
 *
 * The per-object key is the MD5 of: the document key, the low three bytes of
 * the object number, two bytes of generation number (taken from `id`, which
 * is `(objid << 8) | genid`), and "sAlT" for ENC_AESV2. ENC_AESV3 uses the
 * document key directly.
 *
 * @param pdf             PDF parsing state holding the document key.
 * @param id              Object id, `(objid << 8) | (genid & 0xff)`.
 * @param in              Encrypted input.
 * @param[in,out] length  In: input size. Out: may be reduced by the AES
 *                        decryptor to the plaintext size.
 * @param enc_method      Encryption method to apply.
 * @return A newly allocated buffer that the caller must free, or NULL on
 *         invalid arguments, missing key, allocation/hash failure, or when
 *         enc_method is ENC_NONE or ENC_UNKNOWN.
 */
char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, size_t *length, enum enc_method enc_method)
{
    unsigned char *key, *q, result[16] = {0};
    unsigned n;
    struct arc4_state arc4;

    if (!pdf || !length || !*length || !in) {
        noisy_warnmsg("decrypt_any: decrypt failed for obj %u %u: Invalid arguments.\n", id >> 8, id & 0xff);
        return NULL;
    }

    if (NULL == pdf->key || 0 == pdf->keylen) {
        noisy_warnmsg("decrypt_any: decrypt failed for obj %u %u: PDF key never identified.\n", id >> 8, id & 0xff);
        return NULL;
    }

    /* Hash input: document key + 3 bytes objid + 2 bytes genid (+ "sAlT"). */
    n = pdf->keylen + 5;
    if (enc_method == ENC_AESV2)
        n += 4;

    key = cli_max_malloc(n);
    if (!key) {
        noisy_warnmsg("decrypt_any: malloc failed\n");
        return NULL;
    }

    memcpy(key, pdf->key, pdf->keylen);
    q    = key + pdf->keylen;
    *q++ = id >> 8;
    *q++ = id >> 16;
    *q++ = id >> 24;
    *q++ = id;
    *q++ = 0;
    if (enc_method == ENC_AESV2)
        memcpy(q, "sAlT", 4);

    /* Without this check a hashing failure would silently leave `result`
     * unset and the data would be "decrypted" with a meaningless key. */
    if (NULL == cl_hash_data("md5", key, n, result, NULL)) {
        free(key);
        noisy_warnmsg("decrypt_any: failed to hash key for obj %u %u\n", id >> 8, id & 0xff);
        return NULL;
    }
    free(key);

    /* The object key is the first min(keylen + 5, 16) bytes of the digest. */
    n = pdf->keylen + 5;
    if (n > 16)
        n = 16;

    q = cli_max_calloc(*length, sizeof(char));
    if (!q) {
        noisy_warnmsg("decrypt_any: malloc failed\n");
        return NULL;
    }

    switch (enc_method) {
        case ENC_V2:
            cli_dbgmsg("cli_pdf: enc is v2\n");
            memcpy(q, in, *length);
            if (false == arc4_init(&arc4, result, n)) {
                noisy_warnmsg("decrypt_any: failed to init arc4\n");
                free(q);
                return NULL;
            }
            arc4_apply(&arc4, q, (unsigned)*length); /* TODO: may truncate for very large lengths */

            noisy_msg(pdf, "decrypt_any: decrypted ARC4 data\n");

            break;
        case ENC_AESV2:
            cli_dbgmsg("cli_pdf: enc is aesv2\n");
            aes_256cbc_decrypt((const unsigned char *)in, length, q, (char *)result, n, 1);

            noisy_msg(pdf, "decrypt_any: decrypted AES(v2) data\n");

            break;
        case ENC_AESV3:
            cli_dbgmsg("decrypt_any: enc is aesv3\n");

            aes_256cbc_decrypt((const unsigned char *)in, length, q, pdf->key, pdf->keylen, 1);

            noisy_msg(pdf, "decrypted AES(v3) data\n");

            break;
        case ENC_IDENTITY:
            cli_dbgmsg("decrypt_any: enc is identity\n");
            memcpy(q, in, *length);

            noisy_msg(pdf, "decrypt_any: identity encryption\n");

            break;
        case ENC_NONE:
            cli_dbgmsg("decrypt_any: enc is none\n");

            noisy_msg(pdf, "encryption is none\n");

            free(q);
            return NULL;
        case ENC_UNKNOWN:
            cli_dbgmsg("decrypt_any: enc is unknown\n");
            free(q);

            noisy_warnmsg("decrypt_any: unknown encryption method for obj %u %u\n",
                          id >> 8, id & 0xff);

            return NULL;
    }

    return (char *)q;
}
/**
 * @brief Pick the encryption method that applies to an object.
 *
 * Embedded files take precedence over streams; anything else is treated as a
 * string.
 *
 * @param pdf  PDF parsing state holding the three per-kind methods.
 * @param obj  Object whose flags select the method.
 * @return The embedded-file, stream, or string encryption method.
 */
enum enc_method get_enc_method(struct pdf_struct *pdf, struct pdf_obj *obj)
{
    const bool is_embedded_file = (0 != (obj->flags & (1 << OBJ_EMBEDDED_FILE)));
    const bool is_stream        = (0 != (obj->flags & (1 << OBJ_STREAM)));

    if (is_embedded_file) {
        return pdf->enc_method_embeddedfile;
    }

    return is_stream ? pdf->enc_method_stream : pdf->enc_method_string;
}
/* Scanner states used by process() while walking a contents buffer. */
enum cstate {
/* Default state: a '[' switches to CSTATE_TJ, anything else skips to the next newline. */
CSTATE_NONE,
/* After '[': a '(' switches to CSTATE_TJ_PAROPEN. */
CSTATE_TJ,
/* After '(': bytes are fed to the text normalizer until ')' returns to CSTATE_TJ. */
CSTATE_TJ_PAROPEN
};
/**
 * @brief Feed one chunk of a contents stream through the text scanner.
 *
 * Bytes found between '(' and ')' inside a '[' ... sequence are passed to the
 * text normalizer one at a time; whenever the normalizer does not accept a
 * byte, its output buffer is written to `fout` and reset. The scanner state
 * is kept in `*st` so that it carries over between chunks.
 *
 * @param s       Text normalizer state.
 * @param st      Scanner state, updated in place.
 * @param buf     Chunk to scan.
 * @param length  Number of bytes in `buf`; a zero-length chunk is a no-op.
 * @param fout    Descriptor that receives flushed normalizer output.
 */
static void process(struct text_norm_state *s, enum cstate *st, const char *buf, size_t length, int fout)
{
    if (NULL == s || NULL == st || NULL == buf) {
        return;
    }

    /* Check the length before touching *buf so an empty chunk reads nothing. */
    while (length > 0) {
        switch (*st) {
            case CSTATE_NONE:
                if (*buf == '[') {
                    *st = CSTATE_TJ;
                } else {
                    /* Nothing of interest on this line: jump to the newline.
                     * memchr() only returns a pointer inside [buf, buf + length),
                     * so at least one byte (the newline itself) remains. */
                    const char *nl = memchr(buf, '\n', length);
                    if (!nl)
                        return;
                    length -= (size_t)(nl - buf);
                    buf = nl;
                }
                break;
            case CSTATE_TJ:
                if (*buf == '(')
                    *st = CSTATE_TJ_PAROPEN;
                break;
            case CSTATE_TJ_PAROPEN:
                if (*buf == ')') {
                    *st = CSTATE_TJ;
                } else {
                    if (text_normalize_buffer(s, (const unsigned char *)buf, 1) != 1) {
                        cli_writen(fout, s->out, s->out_pos);
                        text_normalize_reset(s);
                    }
                }
                break;
        }

        buf++;
        length--;
    }
}
static int pdf_scan_contents(int fd, struct pdf_struct *pdf, struct pdf_obj *obj)
{
struct text_norm_state s;
char fullname[1024];
char outbuff[BUFSIZ];
char inbuf[BUFSIZ];
int fout;
size_t n;
cl_error_t rc;
enum cstate st = CSTATE_NONE;
snprintf(fullname, sizeof(fullname), "%s" PATHSEP "pdf obj %d %d contents", pdf->dir, obj->id >> 8, obj->id & 0xff);
fout = open(fullname, O_RDWR | O_CREAT | O_EXCL | O_TRUNC | O_BINARY, 0600);
if (fout < 0) {
char err[128];
cli_errmsg("pdf_scan_contents: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err)));
return CL_ETMPFILE;
}
text_normalize_init(&s, (unsigned char *)outbuff, sizeof(outbuff));
while (1) {
n = cli_readn(fd, inbuf, sizeof(inbuf));
if ((n == 0) || (n == (size_t)-1))
break;
process(&s, &st, inbuf, n, fout);
}
cli_writen(fout, s.out, s.out_pos);
lseek(fout, 0, SEEK_SET);
rc = cli_magic_scan_desc(fout, fullname, pdf->ctx, NULL, LAYER_ATTRIBUTES_NONE);
close(fout);
if (!pdf->ctx->engine->keeptmp || (s.out_pos == 0))
if (cli_unlink(fullname) && rc != CL_VIRUS)
rc = CL_EUNLINK;
return rc;
}
/**
 * @brief Extract a PDF object to a temp file and optionally scan it.
 *
 * The object is written to "<pdf->dir>/pdf obj <id> <gen>". Streams are
 * decoded with pdf_decodestream(), JavaScript objects have their /JS strings
 * pulled out (and decrypted when possible), and everything else is written
 * as-is. Each object is only ever extracted once (obj->extracted).
 *
 * @param pdf    PDF parsing state.
 * @param obj    Object to extract.
 * @param flags  With PDF_EXTRACT_OBJ_SCAN the extracted file is scanned and
 *               then removed (unless temp files are kept). Without it the
 *               file is left in place and its path is stored in obj->path
 *               for the caller.
 * @return CL_SUCCESS when nothing was found or nothing needed extracting,
 *         CL_VIRUS on detection, or an error code (CL_ETMPFILE, CL_EMEM,
 *         CL_EWRITE, CL_EFORMAT, CL_EUNLINK, ...).
 */
cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
{
    cl_error_t status = CL_SUCCESS;
    cl_error_t ret;
    char fullname[PATH_MAX + 1];
    bool extracted_an_object = false;
    int fout                 = -1;
    size_t sum               = 0;
    bool dump                = true;
    struct pdf_dict *dparams = NULL;

    cli_dbgmsg("pdf_extract_obj: obj %u %u\n", obj->id >> 8, obj->id & 0xff);

    if (PDF_OBJECT_RECURSION_LIMIT < pdf->parse_recursion_depth) {
        cli_dbgmsg("pdf_extract_obj: Recursion limit reached.\n");
        status = CL_SUCCESS;
        goto done;
    }

    if (obj->extracted) {
        // Should not attempt to extract the same object more than once.
        status = CL_SUCCESS;
        goto done;
    }
    // We're not done yet, but this is enough to say we've tried.
    // Trying again won't help any.
    obj->extracted = true;

    if (obj->objstm) {
        cli_dbgmsg("pdf_extract_obj: extracting obj found in objstm.\n");
        if (obj->objstm->streambuf == NULL) {
            cli_warnmsg("pdf_extract_obj: object in object stream has null stream buffer!\n");
            status = CL_EFORMAT;
            goto done;
        }
    }

    /* Check to see if this is a URI referenced from a prior URI object */
    if (obj->flags & (1 << OBJ_URI)) {
        URI_cb(pdf, obj, NULL);
        status = CL_SUCCESS;
        goto done;
    }

    /* TODO: call bytecode hook here, allow override dumpability */
    if ((!(obj->flags & (1 << OBJ_STREAM)) || (obj->flags & (1 << OBJ_HASFILTERS))) && !(obj->flags & DUMP_MASK)) {
        /* don't dump all streams */
        dump = false;
    }

    if ((obj->flags & (1 << OBJ_IMAGE)) && !(obj->flags & (1 << OBJ_FILTER_DCT))) {
        /* don't dump / scan non-JPG images */
        dump = false;
    }

    if (obj->flags & (1 << OBJ_FORCEDUMP)) {
        /* bytecode can force dump by setting this flag */
        dump = true;
    }

    if (!dump) {
        status = CL_SUCCESS;
        goto done;
    }

    cli_dbgmsg("pdf_extract_obj: dumping obj %u %u\n", obj->id >> 8, obj->id & 0xff);

    snprintf(fullname, sizeof(fullname), "%s" PATHSEP "pdf obj %d %d", pdf->dir, obj->id >> 8, obj->id & 0xff);
    fout = open(fullname, O_RDWR | O_CREAT | O_EXCL | O_TRUNC | O_BINARY, 0600);
    if (fout < 0) {
        char err[128];
        cli_errmsg("pdf_extract_obj: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err)));

        status = CL_ETMPFILE;
        goto done;
    }

    extracted_an_object = true;

    if (!(flags & PDF_EXTRACT_OBJ_SCAN)) {
        /*
         * When PDF_EXTRACT_OBJ_SCAN is not set, this function is used to extract the object to a temp file
         * and so we need to save off the path in obj->path for the caller to use.
         */
        char *saved_path = strdup(fullname);
        if (NULL == saved_path) {
            cli_errmsg("pdf_extract_obj: out of memory saving the path of the extracted object\n");
            /* Nobody would learn the path, so don't leave the file behind. */
            close(fout);
            fout = -1;
            (void)cli_unlink(fullname);
            status = CL_EMEM;
            goto done;
        }
        /* Replace (rather than leak) any path recorded earlier. */
        free(obj->path);
        obj->path = saved_path;
    }

    if ((NULL == obj->objstm) &&
        (obj->flags & (1 << OBJ_STREAM))) {
        /*
         * Object contains a stream. Parse this now.
         */
        cli_dbgmsg("pdf_extract_obj: parsing a stream in obj %u %u\n", obj->id >> 8, obj->id & 0xff);

        const char *start = pdf->map + obj->start;

        size_t length;
        size_t orig_length;
        int dict_len = obj->stream - start; /* Dictionary should end where the stream begins */

        const char *pstr;
        struct objstm_struct *objstm = NULL;
        int xref                     = 0;

        /* Find and interpret the length dictionary value */
        length = find_length(pdf, obj, start, dict_len);

        orig_length = length;

        if (length > obj->stream_size) {
            cli_dbgmsg("cli_pdf: Stream length exceeds object length by %zu bytes. Length truncated to %zu bytes\n", length - obj->stream_size, obj->stream_size);
            noisy_warnmsg("Stream length exceeds object length by %zu bytes. Length truncated to %zu bytes\n", length - obj->stream_size, obj->stream_size);

            length = obj->stream_size;
        }

        if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && (length == 0)) {
            /*
             * If the length is unknown and this doesn't contain a FLATE encoded filter...
             * Calculate the length using the stream size, and trimming
             * off any newline/carriage returns from the end of the stream.
             */
            length = obj->stream_size;

            if (length > 0) {
                /* Last byte of the stream data (not of the dictionary). */
                const char *q = obj->stream + length - 1;

                if (*q == '\n') {
                    length--;

                    if (length > 0 && q[-1] == '\r')
                        length--;
                } else if (*q == '\r') {
                    length--;
                }
            }

            cli_dbgmsg("pdf_extract_obj: calculated length %zu\n", length);
        } else {
            if (obj->stream_size > (size_t)length + 2) {
                cli_dbgmsg("cli_pdf: calculated length %zu < %zu\n",
                           length, obj->stream_size);
                length = obj->stream_size;
            }
        }

        if ((0 != orig_length) && (obj->stream_size > orig_length + 20)) {
            cli_dbgmsg("pdf_extract_obj: orig length: %zu, length: %zu, size: %zu\n",
                       orig_length, length, obj->stream_size);
            pdfobj_flag(pdf, obj, BAD_STREAMLEN);
        }

        if (0 == length) {
            length = obj->stream_size;
            if (0 == length) {
                cli_dbgmsg("pdf_extract_obj: Alleged or calculated stream length and stream buffer size both 0\n");
                /* Empty stream, nothing to scan */
                status = CL_SUCCESS;
                goto done;
            }
        }

        /* Check if XRef is enabled */
        if (cli_memstr(start, dict_len, "/XRef", strlen("/XRef"))) {
            xref = 1;
        }

        /*
         * Identify the DecodeParms, if available.
         */
        if (NULL != (pstr = pdf_getdict(start, &dict_len, "/DecodeParms"))) {
            cli_dbgmsg("pdf_extract_obj: Found /DecodeParms\n");
        } else if (NULL != (pstr = pdf_getdict(start, &dict_len, "/DP"))) {
            cli_dbgmsg("pdf_extract_obj: Found /DP\n");
        }

        if (pstr) {
            /* shift pstr left to "<<" for pdf_parse_dict */
            while ((*pstr == '<') && (pstr > start)) {
                pstr--;
                dict_len++;
            }

            /* shift pstr right to "<<" for pdf_parse_dict */
            while ((*pstr != '<') && (dict_len > 0)) {
                pstr++;
                dict_len--;
            }

            if (dict_len > 4) {
                pdf->parse_recursion_depth++;
                dparams = pdf_parse_dict(pdf, obj, obj->size, (char *)pstr, NULL);
                pdf->parse_recursion_depth--;
            } else {
                cli_dbgmsg("pdf_extract_obj: failed to locate DecodeParms dictionary start\n");
            }
        }

        /*
         * Go back to the start of the dictionary and check to see if the stream
         * is an object stream. If so, collect the relevant info.
         */
        dict_len = obj->stream - start;
        if (NULL != (pstr = pdf_getdict(start, &dict_len, "/Type/ObjStm"))) {
            int objstm_first  = -1;
            int objstm_length = -1;
            int objstm_n      = -1;

            cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n");

            dict_len = obj->stream - start;
            if (-1 == (objstm_first = pdf_readint(start, dict_len, "/First"))) {
                cli_warnmsg("pdf_extract_obj: Failed to find offset of first object in object stream\n");
            } else if (-1 == (objstm_length = pdf_readint(start, dict_len, "/Length"))) {
                cli_warnmsg("pdf_extract_obj: Failed to find length of object stream\n");
            } else if (-1 == (objstm_n = pdf_readint(start, dict_len, "/N"))) {
                cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n");
            } else {
                /* Allocate the objstm before touching pdf->objstms, so that an
                 * allocation failure leaves the array and its count untouched. */
                CLI_CALLOC_OR_GOTO_DONE(
                    objstm, 1, sizeof(struct objstm_struct),
                    cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms + 1),
                    status = CL_EMEM);

                /* Add objstm to pdf struct, so it can be freed eventually */
                pdf->nobjstms++;
                pdf->objstms = cli_max_realloc_or_free(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms);
                if (!pdf->objstms) {
                    cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
                    /* The array is gone; keep the count consistent with it. */
                    pdf->nobjstms = 0;
                    free(objstm);
                    objstm = NULL;
                    status = CL_EMEM;
                    goto done;
                }

                pdf->objstms[pdf->nobjstms - 1] = objstm;

                objstm->first        = (size_t)objstm_first;
                objstm->current      = (size_t)objstm_first;
                objstm->current_pair = 0;
                objstm->length       = (size_t)objstm_length;
                objstm->n            = (size_t)objstm_n;

                cli_dbgmsg("pdf_extract_obj: ObjStm first obj at offset %zu\n", objstm->first);
                cli_dbgmsg("pdf_extract_obj: ObjStm length is %zu bytes\n", objstm->length);
                cli_dbgmsg("pdf_extract_obj: ObjStm should contain %zu objects\n", objstm->n);
            }
        }

        sum = pdf_decodestream(pdf, obj, dparams, obj->stream, (uint32_t)length, xref, fout, &status, objstm);
        if ((CL_SUCCESS != status) && (CL_VIRUS != status)) {
            cli_dbgmsg("Error decoding stream! Error code: %d\n", status);

            /* It's ok if we couldn't decode the stream,
             * make a best effort to keep parsing...
             * Unless we were unable to allocate memory.*/
            if (CL_EMEM == status) {
                goto done;
            }
            if (CL_EPARSE == status) {
                status = CL_SUCCESS;
            }

            if (NULL != objstm) {
                /*
                 * If we were expecting an objstm and there was a failure...
                 *   discard the memory for last object stream.
                 */
                if (NULL != pdf->objstms) {
                    if (NULL != pdf->objstms[pdf->nobjstms - 1]) {
                        if (NULL != pdf->objstms[pdf->nobjstms - 1]->streambuf) {
                            free(pdf->objstms[pdf->nobjstms - 1]->streambuf);
                            pdf->objstms[pdf->nobjstms - 1]->streambuf = NULL;
                        }
                        free(pdf->objstms[pdf->nobjstms - 1]);
                        pdf->objstms[pdf->nobjstms - 1] = NULL;
                    }

                    /* Pop the objstm off the end of the pdf->objstms array. */
                    if (pdf->nobjstms > 0) {
                        pdf->nobjstms--;
                        if (0 == pdf->nobjstms) {
                            free(pdf->objstms);
                            pdf->objstms = NULL;
                        } else {
                            pdf->objstms = cli_max_realloc_or_free(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms);

                            if (!pdf->objstms) {
                                cli_warnmsg("pdf_extract_obj: out of memory when shrinking down objstm array\n");
                                /* The array is gone; keep the count consistent with it. */
                                pdf->nobjstms = 0;
                                status        = CL_EMEM;
                                goto done;
                            }
                        }
                    } else {
                        /* hm.. this shouldn't happen */
                        cli_warnmsg("pdf_extract_obj: Failure counting objstms.\n");
                    }
                }
            }
        }

        if (dparams) {
            pdf_free_dict(dparams);
            dparams = NULL;
        }

        if (status == CL_VIRUS) {
            /* skip post-filter scan */
            goto done;
        }
    } else if (obj->flags & (1 << OBJ_JAVASCRIPT)) {
        const char *q2;
        const char *q = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
                                      : (const char *)(obj->start + pdf->map);

        /* TODO: get obj-endobj size */
        off_t bytesleft = obj->size;

        if (bytesleft < 0) {
            goto scan_extracted_objects;
        }

        do {
            char *js      = NULL;
            size_t js_len = 0;
            const char *q3;

            q2 = cli_memstr(q, bytesleft, "/JavaScript", 11);
            if (!q2)
                break;

            bytesleft -= q2 - q + 11;
            q = q2 + 11;

            js = pdf_readstring(q, bytesleft, "/JS", NULL, &q2, !(pdf->flags & (1 << DECRYPTABLE_PDF)));
            bytesleft -= q2 - q;
            q = q2;

            if (js) {
                char *decrypted = NULL;
                const char *out = js;
                js_len          = strlen(js);
                if (pdf->flags & (1 << DECRYPTABLE_PDF)) {
                    cli_dbgmsg("pdf_extract_obj: encrypted string\n");
                    decrypted = decrypt_any(pdf, obj->id, js, &js_len, pdf->enc_method_string);

                    if (decrypted) {
                        noisy_msg(pdf, "pdf_extract_obj: decrypted Javascript string from obj %u %u\n", obj->id >> 8, obj->id & 0xff);
                        out = decrypted;
                    }
                }

                if ((pdf->ctx->options->general & CL_SCAN_GENERAL_COLLECT_METADATA) && pdf->ctx->this_layer_metadata_json != NULL) {
                    struct json_object *pdfobj, *jbig2arr;

                    if (NULL == (pdfobj = cli_jsonobj(pdf->ctx->this_layer_metadata_json, "PDFStats"))) {
                        cli_errmsg("pdf_extract_obj: failed to get PDFStats JSON object\n");
                    } else if (NULL == (jbig2arr = cli_jsonarray(pdfobj, "JavascriptObjects"))) {
                        cli_errmsg("pdf_extract_obj: failed to get JavascriptObjects JSON object\n");
                    } else {
                        cli_jsonint_array(jbig2arr, obj->id >> 8);
                    }
                }

                pdf->stats.njs++;

                if (filter_writen(pdf, obj, fout, out, js_len, &sum) != js_len) {
                    status = CL_EWRITE;
                    /* `out` may point into `decrypted`; release both buffers. */
                    free(decrypted);
                    free(js);
                    break;
                }

                free(decrypted);
                free(js);
                cli_dbgmsg("pdf_extract_obj: bytesleft: %d\n", (int)bytesleft);

                if (bytesleft > 0) {
                    q2 = pdf_nextobject(q, bytesleft);
                    if (!q2)
                        q2 = q + bytesleft - 1;

                    /* non-conforming PDFs that don't escape ) properly */
                    q3 = memchr(q, ')', bytesleft);
                    if (q3 && q3 < q2)
                        q2 = q3;

                    while (q2 > q && q2[-1] == ' ')
                        q2--;

                    if (q2 > q) {
                        q--;
                        filter_writen(pdf, obj, fout, q, q2 - q, &sum);
                        q++;
                    }
                }
            }
        } while (bytesleft > 0);
    } else {
        off_t bytesleft = obj->size;

        if (bytesleft < 0)
            status = CL_EFORMAT;
        else {
            if (obj->objstm) {
                if (filter_writen(pdf, obj, fout, obj->objstm->streambuf + obj->start, bytesleft, &sum) != (size_t)bytesleft) {
                    status = CL_EWRITE;
                }
            } else {
                if (filter_writen(pdf, obj, fout, pdf->map + obj->start, bytesleft, &sum) != (size_t)bytesleft) {
                    status = CL_EWRITE;
                }
            }
        }
    }

scan_extracted_objects:
    cli_dbgmsg("pdf_extract_obj: extracted %zu bytes %u %u obj\n", sum, obj->id >> 8, obj->id & 0xff);
    cli_dbgmsg("pdf_extract_obj:         ... to %s\n", fullname);

    if ((flags & PDF_EXTRACT_OBJ_SCAN) && (sum > 0)) {
        /*
         * Scan the extracted objects for potential threats.
         * PDF_EXTRACT_OBJ_SCAN is used when the extracted object should be scanned and then deleted.
         */

        /* TODO: invoke bytecode on this pdf obj with metainformation associated */
        lseek(fout, 0, SEEK_SET);
        ret = cli_magic_scan_desc(fout, fullname, pdf->ctx, NULL, LAYER_ATTRIBUTES_NONE);
        if (ret != CL_SUCCESS) {
            status = ret;
            goto done;
        }

        if ((status == CL_CLEAN) || (status == CL_VIRUS)) {
            ret = run_pdf_hooks(pdf, PDF_PHASE_POSTDUMP, fout, fullname);
            if (ret == CL_VIRUS) {
                status = ret;
                goto done;
            }
        }

        if (((status == CL_CLEAN) || (status == CL_VIRUS)) && (obj->flags & (1 << OBJ_CONTENTS))) {
            lseek(fout, 0, SEEK_SET);
            cli_dbgmsg("pdf_extract_obj: dumping contents from obj %u %u\n", obj->id >> 8, obj->id & 0xff);

            ret = pdf_scan_contents(fout, pdf, obj);
            if (ret != CL_SUCCESS) {
                status = ret;
                goto done;
            }
        }
    }

done:

    if (NULL != dparams) {
        pdf_free_dict(dparams);
    }

    if (-1 != fout) {
        close(fout);
    }

    if (extracted_an_object && (flags & PDF_EXTRACT_OBJ_SCAN) && !pdf->ctx->engine->keeptmp) {
        /*
         * When PDF_EXTRACT_OBJ_SCAN is set, the goal is to extract, scan, and delete it.
         * If it was not set, we would keep it and the path is passed back obj->path for the caller to use.
         * That's why we wouldn't unlink it here.
         */
        if (cli_unlink(fullname) && status != CL_VIRUS) {
            status = CL_EUNLINK;
        }
    }

    return status;
}
/* States of the name-driven state machine run by handle_pdfname().
 * The transitions between them are listed in pdfname_actions[]. */
enum objstate {
STATE_NONE,
STATE_S,
STATE_FILTER,
STATE_JAVASCRIPT,
STATE_OPENACTION,
STATE_LINEARIZED,
STATE_LAUNCHACTION,
STATE_CONTENTS,
STATE_URI,
STATE_ANY /* for actions table below */
};
/* Values for pdfname_action.nameflags. handle_pdfname() flags an object with
 * ESCAPED_COMMON_PDFNAME when a name marked NAMEFLAG_HEURISTIC was written
 * with escapes. */
#define NAMEFLAG_NONE 0x0
#define NAMEFLAG_HEURISTIC 0x1
/* One row of the pdfname_actions[] table: what handle_pdfname() does when it
 * meets the PDF name `pdfname`. */
struct pdfname_action {
/* PDF name to match, without the leading '/'. */
const char *pdfname;
enum pdf_objflags set_objflag; /* OBJ_DICT is noop */
/* State the parser must be in for the transition to fire; STATE_ANY matches every state. */
enum objstate from_state; /* STATE_NONE is noop */
/* State entered when the transition fires. */
enum objstate to_state;
/* NAMEFLAG_* bits. */
uint32_t nameflags;
/* Optional callback; when non-NULL it is invoked for every match of the name, whatever the current state. */
void (*pdf_stats_cb)(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
};
static struct pdfname_action pdfname_actions[] = {
{"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCIIHexDecode_cb},
{"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCII85Decode_cb},
{"A85", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCII85Decode_cb},
{"AHx", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCIIHexDecode_cb},
{"EmbeddedFile", OBJ_EMBEDDED_FILE, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC, EmbeddedFile_cb},
{"FlateDecode", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, FlateDecode_cb},
{"Fl", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, FlateDecode_cb},
{"Image", OBJ_IMAGE, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC, Image_cb},
{"LZWDecode", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, LZWDecode_cb},
{"LZW", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, LZWDecode_cb},
{"RunLengthDecode", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, RunLengthDecode_cb},
{"RL", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, RunLengthDecode_cb},
{"CCITTFaxDecode", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, CCITTFaxDecode_cb},
{"CCF", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, CCITTFaxDecode_cb},
{"JBIG2Decode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, JBIG2Decode_cb},
{"DCTDecode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, DCTDecode_cb},
{"DCT", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, DCTDecode_cb},
{"JPXDecode", OBJ_FILTER_JPX, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, JPXDecode_cb},
{"Crypt", OBJ_FILTER_CRYPT, STATE_FILTER, STATE_NONE, NAMEFLAG_HEURISTIC, Crypt_cb},
{"Standard", OBJ_FILTER_STANDARD, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, Standard_cb},
{"Sig", OBJ_SIGNED, STATE_ANY, STATE_NONE, NAMEFLAG_HEURISTIC, Sig_cb},
{"V", OBJ_SIGNED, STATE_ANY, STATE_NONE, NAMEFLAG_HEURISTIC, NULL},
{"R", OBJ_SIGNED, STATE_ANY, STATE_NONE, NAMEFLAG_HEURISTIC, NULL},
{"Linearized", OBJ_DICT, STATE_NONE, STATE_LINEARIZED, NAMEFLAG_HEURISTIC, NULL},
{"Filter", OBJ_HASFILTERS, STATE_ANY, STATE_FILTER, NAMEFLAG_HEURISTIC, NULL},
{"JavaScript", OBJ_JAVASCRIPT, STATE_ANY, STATE_JAVASCRIPT, NAMEFLAG_HEURISTIC, JavaScript_cb},
{"Length", OBJ_DICT, STATE_FILTER, STATE_NONE, NAMEFLAG_HEURISTIC, NULL},
{"S", OBJ_DICT, STATE_NONE, STATE_S, NAMEFLAG_HEURISTIC, NULL},
{"Type", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC, NULL},
{"OpenAction", OBJ_OPENACTION, STATE_ANY, STATE_OPENACTION, NAMEFLAG_HEURISTIC, OpenAction_cb},
{"Launch", OBJ_LAUNCHACTION, STATE_ANY, STATE_LAUNCHACTION, NAMEFLAG_HEURISTIC, Launch_cb},
{"Page", OBJ_PAGE, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC, Page_cb},
{"Contents", OBJ_CONTENTS, STATE_NONE, STATE_CONTENTS, NAMEFLAG_HEURISTIC, NULL},
{"Author", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Author_cb},
{"Producer", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Producer_cb},
{"CreationDate", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, CreationDate_cb},
{"ModDate", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, ModificationDate_cb},
{"Creator", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Creator_cb},
{"Title", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Title_cb},
{"Keywords", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Keywords_cb},
{"Subject", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Subject_cb},
{"Pages", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Pages_cb},
{"Colors", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Colors_cb},
{"RichMedia", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, RichMedia_cb},
{"AcroForm", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, AcroForm_cb},
{"XFA", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, XFA_cb},
{"URI", OBJ_DICT, STATE_NONE, STATE_URI, NAMEFLAG_NONE, URI_cb}};
/* Filter flags that handle_pdfname() records in an object's filter list.
 * An unrecognized name seen in STATE_FILTER is only reported as an unknown
 * filter when none of these flags is already set on the object. */
#define KNOWN_FILTERS ((1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_RL) | (1 << OBJ_FILTER_A85) | (1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_LZW) | (1 << OBJ_FILTER_FAX) | (1 << OBJ_FILTER_DCT) | (1 << OBJ_FILTER_JPX) | (1 << OBJ_FILTER_CRYPT))
static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj, const char *pdfname, int escapes, enum objstate *state)
{
struct pdfname_action *act = NULL;
unsigned j;
// If we process STATE_S we will get duplicate URIs from the prior STATE_NONE
if (!strcmp(pdfname, "URI") && *state == STATE_S) {
*state = STATE_NONE;
return;
}
obj->statsflags |= OBJ_FLAG_PDFNAME_DONE;
// Check to see if this object was observed to be a reference to a URI
if (obj->flags & (1 << OBJ_URI)) {
// Forcing URI here, so we run the pdf_stats_cb for a URI.
pdfname = "URI";
}
for (j = 0; j < sizeof(pdfname_actions) / sizeof(pdfname_actions[0]); j++) {
if (!strcmp(pdfname, pdfname_actions[j].pdfname)) {
act = &pdfname_actions[j];
break;
}
}
if (!act) {
/* these are digital signature objects, filter doesn't matter,
* we don't need them anyway */
if (*state == STATE_FILTER && !(obj->flags & (1 << OBJ_SIGNED)) && !(obj->flags & KNOWN_FILTERS)) {
cli_dbgmsg("handle_pdfname: unknown filter %s\n", pdfname);
obj->flags |= 1 << OBJ_FILTER_UNKNOWN;
}
return;
}
/* record filter order */
if (obj->numfilters < PDF_FILTERLIST_MAX && (*state == STATE_FILTER) && ((1 << act->set_objflag) & KNOWN_FILTERS))
obj->filterlist[obj->numfilters++] = act->set_objflag;
if ((act->nameflags & NAMEFLAG_HEURISTIC) && escapes) {
/* if a commonly used PDF name is escaped that is certainly
suspicious. */
cli_dbgmsg("handle_pdfname: pdfname %s is escaped\n", pdfname);
pdfobj_flag(pdf, obj, ESCAPED_COMMON_PDFNAME);
}
if ((act->pdf_stats_cb))
act->pdf_stats_cb(pdf, obj, act);
if (act->from_state == *state || act->from_state == STATE_ANY) {
*state = act->to_state;
if (*state == STATE_FILTER && act->set_objflag != OBJ_DICT && (obj->flags & (1 << act->set_objflag))) {
cli_dbgmsg("handle_pdfname: duplicate stream filter %s\n", pdfname);
pdfobj_flag(pdf, obj, BAD_STREAM_FILTERS);
}
obj->flags |= 1 << act->set_objflag;
} else {
/* auto-reset states */
switch (*state) {
case STATE_S:
*state = STATE_NONE;
break;
default:
break;
}
}
}
/**
 * @brief Parse "/Encrypt <objid> <genid> R" and remember the referenced object.
 *
 * If `enc` starts with "/EncryptMetadata", the search moves on to the next
 * "/Encrypt" in the buffer. On success pdf->enc_objid is set to
 * `(objid << 8) | (genid & 0xff)`; on any parse failure it is left alone.
 *
 * @param pdf  PDF parsing state.
 * @param enc  Pointer to the "/Encrypt" key.
 * @param len  Number of bytes available at `enc`.
 */
static void pdf_parse_encrypt(struct pdf_struct *pdf, const char *enc, int len)
{
    const char *q, *q2;
    unsigned long objid;
    unsigned long genid;
    long temp_long;

    if (NULL == pdf || NULL == enc) {
        return;
    }

    if (len >= 16 && !strncmp(enc, "/EncryptMetadata", 16)) {
        q = cli_memstr(enc + 16, len - 16, "/Encrypt", 8);
        if (!q)
            return;

        len -= q - enc;
        enc = q;
    }

    /* There must be room for the "/Encrypt" key itself. Without this check a
     * short buffer would make `len` negative and turn it into a huge size
     * when passed on. */
    if (len < 8) {
        return;
    }

    q = enc + 8;
    len -= 8;
    q2 = pdf_nextobject(q, len);
    /* isdigit() is only defined for unsigned char values (and EOF). */
    if (!q2 || !isdigit((unsigned char)*q2))
        return;
    len -= q2 - q;
    q = q2;

    if (CL_SUCCESS != cli_strntol_wrap(q2, (size_t)len, 0, 10, &temp_long)) {
        cli_dbgmsg("pdf_parse_encrypt: Found Encrypt dictionary but failed to parse objid\n");
        return;
    } else if (temp_long < 0) {
        cli_dbgmsg("pdf_parse_encrypt: Encountered invalid negative objid (%ld).\n", temp_long);
        return;
    }
    objid = (unsigned long)temp_long;

    objid = objid << 8;
    q2    = pdf_nextobject(q, len);
    if (!q2 || !isdigit((unsigned char)*q2))
        return;
    len -= q2 - q;
    q = q2;

    if (CL_SUCCESS != cli_strntol_wrap(q2, (size_t)len, 0, 10, &temp_long)) {
        cli_dbgmsg("pdf_parse_encrypt: Found Encrypt dictionary but failed to parse genid\n");
        return;
    } else if (temp_long < 0) {
        cli_dbgmsg("pdf_parse_encrypt: Encountered invalid negative genid (%ld).\n", temp_long);
        return;
    }
    genid = (unsigned long)temp_long;

    objid |= genid & 0xff;
    q2 = pdf_nextobject(q, len);
    /* Only an indirect reference ("... R") is accepted. */
    if (!q2 || *q2 != 'R')
        return;

    cli_dbgmsg("pdf_parse_encrypt: Encrypt dictionary in obj %lu %lu\n", objid >> 8, objid & 0xff);

    pdf->enc_objid = objid;
}
/**
 * @brief Look for encryption information in a trailer.
 *
 * When the trailer contains "/Encrypt", the PDF is flagged as encrypted, the
 * reference is handed to pdf_parse_encrypt(), and the "/ID" string (if one
 * can be read) replaces pdf->fileID / pdf->fileIDlen.
 *
 * @param pdf     PDF parsing state.
 * @param s       Start of the trailer.
 * @param length  Number of bytes at `s`.
 */
static void pdf_parse_trailer(struct pdf_struct *pdf, const char *s, long length)
{
    const char *encrypt_key = cli_memstr(s, length, "/Encrypt", 8);
    unsigned int file_id_len = 0;
    char *file_id;

    if (NULL == encrypt_key) {
        return;
    }

    pdf->flags |= 1 << ENCRYPTED_PDF;
    pdf_parse_encrypt(pdf, encrypt_key, s + length - encrypt_key);

    file_id = pdf_readstring(s, length, "/ID", &file_id_len, NULL, false);
    if (NULL == file_id) {
        return;
    }

    /* The new ID takes over; release the one stored previously. */
    free(pdf->fileID);
    pdf->fileID    = file_id;
    pdf->fileIDlen = file_id_len;
}
/**
 * @brief Append an object's number to a named array under the "PDFStats"
 *        JSON object of the current layer.
 *
 * Does nothing when no JSON metadata object exists for this layer or when
 * the JSON objects cannot be obtained.
 */
static void pdf_parseobj_record_stat(struct pdf_struct *pdf, struct pdf_obj *obj, const char *array_name)
{
    json_object *pdfobj  = NULL;
    json_object *jsonobj = NULL;

    if (NULL == pdf->ctx->this_layer_metadata_json)
        return;

    pdfobj = cli_jsonobj(pdf->ctx->this_layer_metadata_json, "PDFStats");
    if (!pdfobj)
        return;

    jsonobj = cli_jsonarray(pdfobj, array_name);
    if (jsonobj)
        cli_jsonint_array(jsonobj, obj->id >> 8);
}

/**
 * @brief Parse one PDF object: detect its stream, locate its dictionary and
 *        process the names found in the dictionary.
 *
 * Steps performed:
 *  - for objects not inside an object stream, look for a stream and record
 *    its bounds (OBJ_STREAM);
 *  - find the opening "<<" and the matching ">>" of the dictionary
 *    (OBJ_DICT); objects without a (complete) dictionary are reported in the
 *    JSON stats and parsing stops;
 *  - normalize each "/Name" in the dictionary and feed it to
 *    handle_pdfname(), reacting to the resulting state (linearized trailer,
 *    launch action, and indirect references for JavaScript / OpenAction /
 *    Contents / URI);
 *  - finally derive the MANY_FILTERS / UNKNOWN_FILTER flags.
 *
 * @param pdf  PDF context.
 * @param obj  Object to parse; its flags and stream fields are updated.
 */
void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
{
    /* enough to hold common pdf names, we don't need all the names */
    char pdfname[64] = {0};
    const char *q2, *q3;
    const char *nextobj = NULL, *nextopen = NULL, *nextclose = NULL;
    const char *q = NULL;
    const char *dict = NULL, *enddict = NULL, *start = NULL;
    off_t dict_length = 0, full_dict_length = 0, bytesleft = 0;
    size_t i = 0;
    unsigned filters = 0, blockopens = 0;
    enum objstate objstate = STATE_NONE;

    if (NULL == pdf || NULL == obj) {
        cli_warnmsg("pdf_parseobj: invalid arguments\n");
        return;
    }

    cli_dbgmsg("pdf_parseobj: Parsing object %u %u\n", obj->id >> 8, obj->id & 0xff);

    if (obj->objstm) {
        if ((size_t)obj->start > obj->objstm->streambuf_len) {
            cli_dbgmsg("pdf_parseobj: %u %u obj: obj start (%u) is greater than size of object stream (%zu).\n",
                       obj->id >> 8, obj->id & 0xff, obj->start, obj->objstm->streambuf_len);
            return;
        }
        q = (const char *)(obj->start + obj->objstm->streambuf);
    } else {
        if ((size_t)obj->start > pdf->size) {
            cli_dbgmsg("pdf_parseobj: %u %u obj: obj start (%u) is greater than size of PDF (%lld).\n",
                       obj->id >> 8, obj->id & 0xff, obj->start, (long long)pdf->size);
            return;
        }
        q = (const char *)(obj->start + pdf->map);
    }
    start = q;

    if (obj->size <= 0)
        return;

    if (obj->objstm) {
        bytesleft = MIN(obj->size, obj->objstm->streambuf_len - obj->start);
    } else {
        bytesleft = MIN(obj->size, pdf->size - obj->start);
    }

    /* For objects that aren't already in an object stream^, check if they contain a stream.
     * ^Objects in object streams aren't supposed to contain streams, so we don't check them. */
    if (NULL == obj->objstm) {
        /* Check if object contains stream */
        cl_error_t has_stream;
        const char *stream = NULL;
        size_t stream_size = 0;

        has_stream = find_stream_bounds(
            start,
            obj->size,
            &stream,
            &stream_size,
            (pdf->enc_method_stream <= ENC_IDENTITY) && (pdf->enc_method_embeddedfile <= ENC_IDENTITY));

        if ((CL_SUCCESS == has_stream) ||
            (CL_EFORMAT == has_stream)) {
            /* Stream found. Store this fact and the stream bounds. */
            cli_dbgmsg("pdf_parseobj: %u %u contains stream, size: %zu\n", obj->id >> 8, obj->id & 0xff, stream_size);
            obj->flags |= (1 << OBJ_STREAM);
            obj->stream      = stream;
            obj->stream_size = stream_size;
        }
    }

    /* find start of dictionary */
    do {
        nextobj = pdf_nextobject(q, bytesleft);
        /* Only do pointer arithmetic once we know nextobj is valid. */
        if (nextobj)
            bytesleft -= nextobj - q;

        if (!nextobj || bytesleft < 0) {
            cli_dbgmsg("pdf_parseobj: %u %u obj: no dictionary\n", obj->id >> 8, obj->id & 0xff);
            pdf_parseobj_record_stat(pdf, obj, "ObjectsWithoutDictionaries");
            return;
        }

        /*
         * Opening `<` for object's dictionary may be back 1 character,
         * provided q is not at the start of the buffer (it shouldn't be).
         */
        if (obj->objstm) {
            if (obj->objstm->streambuf == q) {
                q3 = memchr(q, '<', nextobj - q);
            } else {
                q3 = memchr(q - 1, '<', nextobj - q + 1);
            }
        } else {
            if (pdf->map == q) {
                q3 = memchr(q, '<', nextobj - q);
            } else {
                q3 = memchr(q - 1, '<', nextobj - q + 1);
            }
        }
        nextobj++;
        bytesleft--;
        q = nextobj;
    } while (!q3 || q3[1] != '<');

    dict = q3 + 2;
    q    = dict;
    blockopens++;
    bytesleft = obj->size - (q - start);
    enddict   = q + bytesleft - 1;

    /* find end of dictionary block */
    if (bytesleft < 0) {
        cli_dbgmsg("pdf_parseobj: %u %u obj: broken dictionary\n", obj->id >> 8, obj->id & 0xff);
        pdf_parseobj_record_stat(pdf, obj, "ObjectsWithBrokenDictionaries");
        return;
    }

    /* while still looking ... */
    while ((q < enddict - 1) && (blockopens > 0)) {
        /* find next close */
        nextclose = memchr(q, '>', enddict - q);
        if (nextclose && (nextclose[1] == '>')) {
            /* check for nested open */
            while ((nextopen = memchr(q - 1, '<', nextclose - q + 1)) != NULL) {
                if (nextopen[1] == '<') {
                    /* nested open */
                    blockopens++;
                    q = nextopen + 2;
                } else {
                    /* unmatched < before next close */
                    q = nextopen + 2;
                }
            }
            /* close block */
            blockopens--;
            q = nextclose + 2;
        } else if (nextclose) {
            /* found one > but not two */
            q = nextclose + 2;
        } else {
            /* next closing not found */
            break;
        }
    }

    /* Was end of dictionary found? */
    if (blockopens) {
        /* probably truncated */
        cli_dbgmsg("pdf_parseobj: %u %u obj broken dictionary\n", obj->id >> 8, obj->id & 0xff);
        pdf_parseobj_record_stat(pdf, obj, "ObjectsWithBrokenDictionaries");
        return;
    }

    enddict = nextclose;
    obj->flags |= 1 << OBJ_DICT;
    full_dict_length = dict_length = enddict - dict;

    /* This code prints the dictionary content.
    {
        char * dictionary = malloc(dict_length + 1);
        if (dictionary) {
            for (i = 0; i < dict_length; i++) {
                if (dict[i] == '\r')
                    dictionary[i] = '\n';
                else if (isprint(dict[i]) || isspace(dict[i]))
                    dictionary[i] = dict[i];
                else
                    dictionary[i] = '*';
            }
            dictionary[dict_length] = '\0';
            cli_dbgmsg("pdf_parseobj: dictionary is <<%s>>\n", dictionary);
            free(dictionary);
        }
    }
    */

    /* process pdf names */
    for (q = dict; dict_length > 0;) {
        int escapes = 0, breakout = 0;
        q2 = memchr(q, '/', dict_length);
        if (!q2)
            break;

        dict_length -= q2 - q;
        q = q2;
        /* normalize PDF names */
        for (i = 0; dict_length > 0 && (i < sizeof(pdfname) - 1); i++) {
            q++;
            dict_length--;

            if (*q == '#') {
                /* "#xx" is a hex-escaped character inside a name */
                if (cli_hex2str_to(q + 1, pdfname + i, 2) == -1)
                    break;

                q += 2;
                dict_length -= 2;
                escapes = 1;
                continue;
            }

            /* any whitespace or delimiter ends the name */
            switch (*q) {
                case ' ':
                case '\t':
                case '\r':
                case '\n':
                case '/':
                case '>':
                case '[':
                case ']':
                case '<':
                case '(':
                    breakout = 1;
            }

            if (breakout)
                break;

            pdfname[i] = *q;
        }

        pdfname[i] = '\0';

        handle_pdfname(pdf, obj, pdfname, escapes, &objstate);
        if (objstate == STATE_LINEARIZED) {
            long trailer_end, trailer;

            pdfobj_flag(pdf, obj, LINEARIZED_PDF);
            objstate = STATE_NONE;
            trailer_end = pdf_readint(dict, full_dict_length, "/H");
            if ((trailer_end > 0) && ((size_t)trailer_end < pdf->size)) {
                /* inspect (at most) the 1024 bytes preceding trailer_end */
                trailer = trailer_end - 1024;
                if (trailer < 0)
                    trailer = 0;

                q2 = pdf->map + trailer;
                cli_dbgmsg("pdf_parseobj: looking for trailer in linearized pdf: %ld - %ld\n", trailer, trailer_end);
                pdf_parse_trailer(pdf, q2, trailer_end - trailer);
                if (pdf->fileID)
                    cli_dbgmsg("pdf_parseobj: found fileID\n");
            }
        }

        if (objstate == STATE_LAUNCHACTION)
            pdfobj_flag(pdf, obj, HAS_LAUNCHACTION);

        if (dict_length > 0 && (objstate == STATE_JAVASCRIPT ||
                                objstate == STATE_OPENACTION ||
                                objstate == STATE_CONTENTS ||
                                objstate == STATE_URI)) {
            off_t dict_remaining = dict_length;

            if (objstate == STATE_OPENACTION)
                pdfobj_flag(pdf, obj, HAS_OPENACTION);

            /* The value may be an indirect reference "<objid> <genid> R".
             * Bytes come from the scanned file: cast before using <ctype.h>. */
            q2 = pdf_nextobject(q, dict_remaining);
            if (q2 && isdigit((unsigned char)*q2)) {
                const char *q2_old = NULL;
                unsigned long objid;
                unsigned long genid;
                long temp_long;

                dict_remaining -= (off_t)(q2 - q);

                if (CL_SUCCESS != cli_strntol_wrap(q2, (size_t)dict_remaining, 0, 10, &temp_long)) {
                    cli_dbgmsg("pdf_parseobj: failed to parse object objid\n");
                    return;
                } else if (temp_long < 0) {
                    cli_dbgmsg("pdf_parseobj: Encountered invalid negative objid (%ld).\n", temp_long);
                    return;
                }
                objid = (unsigned long)temp_long;

                objid = objid << 8;

                /* skip past the digits of the object number */
                while ((dict_remaining > 0) && isdigit((unsigned char)*q2)) {
                    q2++;
                    dict_remaining--;
                }

                q2_old = q2;
                q2     = pdf_nextobject(q2, dict_remaining);
                if (q2 && isdigit((unsigned char)*q2)) {
                    dict_remaining -= (off_t)(q2 - q2_old);
                    if (CL_SUCCESS != cli_strntol_wrap(q2, (size_t)dict_remaining, 0, 10, &temp_long)) {
                        cli_dbgmsg("pdf_parseobj: failed to parse object genid\n");
                        return;
                    } else if (temp_long < 0) {
                        cli_dbgmsg("pdf_parseobj: Encountered invalid negative genid (%ld).\n", temp_long);
                        return;
                    }
                    genid = (unsigned long)temp_long;

                    objid |= genid & 0xff;

                    q2 = pdf_nextobject(q2, dict_remaining);
                    if (q2 && *q2 == 'R') {
                        struct pdf_obj *obj2;

                        cli_dbgmsg("pdf_parseobj: found %s stored in indirect object %lu %lu\n", pdfname, objid >> 8, objid & 0xff);
                        obj2 = find_obj(pdf, obj, objid);
                        if (obj2) {
                            enum pdf_objflags flag = OBJ_STREAM;

                            switch (objstate) {
                                case STATE_JAVASCRIPT:
                                    flag = OBJ_JAVASCRIPT;
                                    break;
                                case STATE_OPENACTION:
                                    flag = OBJ_OPENACTION;
                                    break;
                                case STATE_CONTENTS:
                                    flag = OBJ_CONTENTS;
                                    break;
                                case STATE_URI:
                                    flag = OBJ_URI;
                                    break;
                                default:
                                    cli_dbgmsg("pdf_parseobj: Unexpected object type\n");
                                    return;
                            }

                            obj->flags &= ~(1 << flag); /* Disable flag for current object ... */
                            obj2->flags |= 1 << flag;   /* ... and set the flag for the indirect object instead! */
                        } else {
                            pdfobj_flag(pdf, obj, BAD_INDOBJ);
                        }
                    }
                }
            }

            objstate = STATE_NONE;
        }
    }

    /* Count the distinct non-crypt stream filters that were flagged on this object. */
    for (i = 0; i < sizeof(pdfname_actions) / sizeof(pdfname_actions[0]); i++) {
        const struct pdfname_action *act = &pdfname_actions[i];

        if ((obj->flags & (1 << act->set_objflag)) &&
            act->from_state == STATE_FILTER &&
            act->to_state == STATE_FILTER &&
            act->set_objflag != OBJ_FILTER_CRYPT &&
            act->set_objflag != OBJ_FILTER_STANDARD) {
            filters++;
        }
    }

    if (filters > 2) {
        /* more than 2 non-crypt filters */
        pdfobj_flag(pdf, obj, MANY_FILTERS);
    }

    /* A signed object or one with a known filter is not reported as "unknown filter". */
    if (obj->flags & ((1 << OBJ_SIGNED) | KNOWN_FILTERS))
        obj->flags &= ~(1 << OBJ_FILTER_UNKNOWN);

    if (obj->flags & (1 << OBJ_FILTER_UNKNOWN))
        pdfobj_flag(pdf, obj, UNKNOWN_FILTER);

    cli_dbgmsg("pdf_parseobj: %u %u obj flags: %02x\n", obj->id >> 8, obj->id & 0xff, obj->flags);
}
/**
 * @brief Tell whether a byte ends a PDF name token.
 *
 * PDF names run until white-space or a delimiter character, so a key is
 * only really present when the byte following it is one of these.
 */
static int pdf_is_name_terminator(char c)
{
    switch (c) {
        case '\0':
        case '\t':
        case '\n':
        case '\v':
        case '\f':
        case '\r':
        case ' ':
        case '(':
        case ')':
        case '<':
        case '>':
        case '[':
        case ']':
        case '{':
        case '}':
        case '/':
        case '%':
            return 1;
        default:
            return 0;
    }
}

/**
 * @brief   Given a pointer to a dictionary object and a key, get the key's value.
 *
 * The key is matched as a whole name: an occurrence that is immediately
 * followed by further name characters (e.g. "/OE" when looking for "/O") is
 * skipped and the search continues after it.
 *
 * @param q0            Pointer to the start of the dictionary.
 * @param[in,out] len   In:  The number of bytes in the dictionary.
 *                      Out: The number of bytes remaining from the start
 *                           of the value to the end of the dict.
 *                      Only meaningful on success.
 * @param key           Null terminated 'key' to search for.
 * @return const char*  Address of the dictionary key's 'value', or NULL if
 *                      the key or its value was not found.
 */
static const char *pdf_getdict(const char *q0, int *len, const char *key)
{
    const char *q;
    const char *search;
    size_t keylen;
    int remaining;

    if (*len <= 0) {
        cli_dbgmsg("pdf_getdict: bad length %d\n", *len);
        return NULL;
    }

    if (!q0)
        return NULL;

    keylen = strlen(key);

    /* find the key */
    search    = q0;
    remaining = *len;
    while (NULL != (q = cli_memstr(search, remaining, key, keylen))) {
        size_t offset = (size_t)(q - q0);

        /* Accept when the key ends the buffer (nothing to inspect) or is
         * followed by a terminator; otherwise it is only a prefix of a
         * longer name, so keep looking. */
        if ((offset + keylen >= (size_t)*len) || pdf_is_name_terminator(q[keylen]))
            break;

        search    = q + 1;
        remaining = *len - (int)(offset + 1);
    }

    if (!q) {
        cli_dbgmsg("pdf_getdict: %s not found in dict\n", key);
        return NULL;
    }

    *len -= q - q0;
    q0 = q;

    /* find the start of the value object */
    q = pdf_nextobject(q0 + 1, *len - 1);
    if (!q) {
        cli_dbgmsg("pdf_getdict: %s is invalid in dict\n", key);
        return NULL;
    }

    /* if the value is a dictionary object, include the < > brackets.*/
    while (q > q0 && (q[-1] == '<' || q[-1] == '\n'))
        q--;

    *len -= q - q0;
    return q;
}
/**
 * @brief Read the value string from a PDF dictionary key/value pair.
 *
 * Two encodings are handled: literal strings "( ... )" (optionally
 * un-escaped) and hex strings "< ... >". The returned buffer is
 * NUL-terminated and must be released by the caller with free().
 *
 * @param q0            A pointer into the PDF dictionary.
 * @param len           The bytes remaining in the file.
 * @param key           The key we're looking for.
 * @param [out] slen    The length of the output string (0 on failure).
 * @param [out] qend    The pointer we wound up at, after the end of the value.
 *                      Set to q0 if the value could not be located.
 * @param noescape      Select 'true' to ignore escape characters, 'false' to process them.
 * @return char*        Newly allocated string, or NULL on failure.
 */
static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *slen, const char **qend, bool noescape)
{
    char *s, *s0;
    const char *start, *q, *end;

    if (slen)
        *slen = 0;

    if (qend)
        *qend = q0;

    q = pdf_getdict(q0, &len, key);
    if (!q || len <= 0)
        return NULL;

    if (*q == '(') {
        /* Literal string: find the matching close paren, honoring nesting
         * and skipping whatever character follows a backslash. */
        int paren = 1;
        start     = ++q;
        len--;
        for (; paren > 0 && len > 0; q++, len--) {
            switch (*q) {
                case '(':
                    paren++;
                    break;
                case ')':
                    paren--;
                    break;
                case '\\':
                    q++;
                    len--;
                    break;
                default:
                    break;
            }
        }
        if (len <= 0) {
            cli_errmsg("pdf_readstring: Invalid, truncated dictionary.\n");
            return NULL;
        }

        if (qend)
            *qend = q;

        /* q is one past the closing paren; step back onto it */
        q--;
        len = q - start;
        /* un-escaping never grows the string, so len + 1 is always enough */
        s0 = s = cli_max_malloc(len + 1);
        if (!s) {
            cli_errmsg("pdf_readstring: Unable to allocate buffer\n");
            return NULL;
        }

        end = start + len;
        if (noescape) {
            memcpy(s0, start, len);
            s = s0 + len;
        } else {
            for (q = start; q < end; q++) {
                if (*q != '\\') {
                    *s++ = *q;
                } else {
                    q++;
                    switch (*q) {
                        case 'n':
                            *s++ = '\n';
                            break;
                        case 'r':
                            *s++ = '\r';
                            break;
                        case 't':
                            *s++ = '\t';
                            break;
                        case 'b':
                            *s++ = '\b';
                            break;
                        case 'f':
                            *s++ = '\f';
                            break;
                        case '(': /* fall-through */
                        case ')': /* fall-through */
                        case '\\':
                            *s++ = *q;
                            break;
                        case '\n':
                            /* ignore */
                            break;
                        case '\r':
                            /* ignore */
                            if (q + 1 < end && q[1] == '\n')
                                q++;
                            break;
                        case '0':
                        case '1':
                        case '2':
                        case '3':
                        case '4':
                        case '5':
                        case '6':
                        case '7': {
                            /* octal escape: one to three octal digits,
                             * high-order overflow is discarded */
                            unsigned value = 0;
                            int ndigits    = 0;

                            while (ndigits < 3 && (q + ndigits) < end &&
                                   q[ndigits] >= '0' && q[ndigits] <= '7') {
                                value = value * 8 + (unsigned)(q[ndigits] - '0');
                                ndigits++;
                            }
                            *s++ = (char)(value & 0xff);
                            /* the loop's q++ consumes the last digit */
                            q += ndigits - 1;
                            break;
                        }
                        default:
                            /* not a recognized escape: keep the backslash
                             * and re-process the character as a literal */
                            *s++ = '\\';
                            q--;
                            break;
                    }
                }
            }
        }

        *s++ = '\0';
        if (slen)
            *slen = s - s0 - 1;

        return s0;
    }

    if ((*q == '<') && (len >= 3)) {
        /* Hex string */
        start = ++q;
        len -= 1;
        // skip newlines after <
        while (len > 0 && *start == '\n') {
            start = ++q;
            len -= 1;
        }

        /* Nothing left to search: without this check `len - 1` below would
         * wrap to a huge size when len reached 0. */
        if (len <= 1)
            return NULL;

        q = memchr(q + 1, '>', len - 1);
        if (!q)
            return NULL;

        if (qend)
            *qend = q;

        s = cli_max_malloc((q - start) / 2 + 1);
        if (s == NULL) { /* oops, couldn't allocate memory */
            cli_dbgmsg("pdf_readstring: unable to allocate memory...\n");
            return NULL;
        }

        if (cli_hex2str_to(start, s, q - start)) {
            cli_dbgmsg("pdf_readstring: %s has bad hex value\n", key);
            free(s);
            return NULL;
        }

        s[(q - start) / 2] = '\0';
        if (slen)
            *slen = (q - start) / 2;

        return s;
    }

    cli_dbgmsg("pdf_readstring: %s is invalid string in dict\n", key);
    return NULL;
}
/**
 * @brief Read a name value ("/Key /Value") from a PDF dictionary.
 *
 * The returned string is the name without its leading '/', ending at the
 * next '/' or ">>" with trailing white-space removed. The caller must free it.
 *
 * @param q    Pointer into the dictionary.
 * @param len  Number of bytes available at q.
 * @param key  Key to look up.
 * @return char*  Newly allocated NUL-terminated name, or NULL if the key is
 *                missing, its value is not a name, or allocation fails.
 */
static char *pdf_readval(const char *q, int len, const char *key)
{
    const char *end;
    char *s;

    q = pdf_getdict(q, &len, key);
    if (!q || len <= 0)
        return NULL;

    while (len > 0 && *q && *q == ' ') {
        q++;
        len--;
    }

    /* Do not look at *q if the spaces ran to the end of the buffer. */
    if (len <= 0 || *q != '/')
        return NULL;

    q++;
    len--;
    end = q;
    while (len > 0 && *end && !(*end == '/' || (len > 1 && end[0] == '>' && end[1] == '>'))) {
        end++;
        len--;
    }

    /* end-of-buffer whitespace trimming; never backs up past the start of
     * the name. The cast keeps <ctype.h> defined for bytes >= 0x80. */
    while (end > q && isspace((unsigned char)*(end - 1))) {
        end--;
        len++;
    }

    s = cli_max_malloc(end - q + 1);
    if (!s)
        return NULL;

    memcpy(s, q, end - q);
    s[end - q] = '\0';

    return s;
}
/**
 * @brief Read a base-10 integer value for a key in a PDF dictionary.
 *
 * @param q0   Pointer into the dictionary.
 * @param len  Number of bytes available at q0.
 * @param key  Key to look up.
 * @return int The parsed value (converted from long), or -1 when the key is
 *             missing or its value cannot be parsed as an integer.
 */
static int pdf_readint(const char *q0, int len, const char *key)
{
    long parsed       = 0;
    const char *valuep = pdf_getdict(q0, &len, key);

    if ((NULL == valuep) ||
        (CL_SUCCESS != cli_strntol_wrap(valuep, (size_t)len, 0, 10, &parsed))) {
        return -1;
    }

    return parsed;
}
/**
 * @brief Read a boolean ("true" / "false") value for a key in a PDF dictionary.
 *
 * @param q0       Pointer into the dictionary.
 * @param len      Number of bytes available at q0.
 * @param key      Key to look up.
 * @param Default  Value returned when the key is missing, fewer than 5 bytes
 *                 remain at the value, or the value is neither keyword.
 * @return int     1 for "true", 0 for "false", otherwise Default.
 */
static int pdf_readbool(const char *q0, int len, const char *key, int Default)
{
    const char *valuep = pdf_getdict(q0, &len, key);

    if ((NULL != valuep) && (len >= 5)) {
        if (0 == strncmp(valuep, "true", 4)) {
            return 1;
        }
        if (0 == strncmp(valuep, "false", 5)) {
            return 0;
        }

        cli_dbgmsg("pdf_readbool: invalid value for %s bool\n", key);
    }

    return Default;
}
/*
 * 32-byte padding string used by check_user_password() for R2-R4:
 * it stands in for the (empty) password when deriving the candidate key,
 * and it is the input that is RC4-encrypted (R2) or MD5-hashed together
 * with the file ID (R3/R4) to compute the expected /U value.
 * Note it contains embedded NUL bytes, so always copy it with an explicit
 * length of 32 rather than with string functions.
 */
static const char *key_padding =
"\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4e\x56\xff\xfa\x01\x08"
"\x2e\x2e\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A";
/**
 * @brief Debug-log a byte buffer as a hex string, prefixed with a label.
 *
 * Does nothing unless debug output is enabled (cli_debug_flag).
 *
 * @param msg  Label printed before the hex dump.
 * @param hex  Bytes to dump.
 * @param len  Number of bytes to dump.
 */
static void dbg_printhex(const char *msg, const char *hex, unsigned len)
{
    char *kh;

    if (!cli_debug_flag)
        return;

    kh = cli_str2hex(hex, len);
    if (NULL == kh) {
        /* Conversion failed (presumably out of memory); never hand NULL to "%s". */
        return;
    }

    cli_dbgmsg("cli_pdf: %s: %s\n", msg, kh);
    free(kh);
}
/**
 * @brief Compute the hash of the password concatenated with the validation salt and (for owner-password checks) the U string.
 *
 * Some details and comments for how to compute this hash comes from the PyPDF project:
 * https://github.com/py-pdf/pypdf/blob/3.17.4/pypdf/_encryption.py#L568
 *
 * @param password  The password to hash.
 * @param pwlen     The length of the password. Values above 127 are clamped
 *                  to 127 so the working buffer cannot be overrun.
 * @param salt      The validation (or key) salt; only the first 8 bytes are used.
 * @param hash      The resulting hash (32 bytes are written).
 * @param U         [Optional] The U string (for owner-password checks); 48 bytes are read.
 */
static void compute_hash_r6(const char *password, size_t pwlen, const unsigned char salt[16], unsigned char hash[32], const char *U)
{
    /* 64 repetitions of (password (<=128) + previous block (<=64) + U (48)) */
    unsigned char data[(128 + 64 + 48) * 64];
    unsigned char block[64];
    int32_t block_size = 32;
    size_t in_data_len = 0, out_data_len;
    int32_t i, j, sum;
    uint8_t sha2_256[32], sha2_384[48], sha2_512[64];

    /* `data` is sized for passwords of at most 128 bytes; longer input
     * would overflow it. Passwords are limited to 127 bytes for this
     * algorithm, so truncate rather than overrun the stack buffer. */
    if (pwlen > 127)
        pwlen = 127;

    /*
     * Compute a SHA-256 hash of the UTF-8 password concatenated with the 8 bytes of the owner or user validation salt.
     */
    memcpy(data, password, pwlen);
    memcpy(data + pwlen, salt, 8);
    if (NULL != U) {
        // If it's for the owner password check, we also concatenate the 48-byte U string.
        memcpy(data + pwlen + 8, U, 48);
        cl_sha256(data, pwlen + 8 + 48, block, NULL);
    } else {
        cl_sha256(data, pwlen + 8, block, NULL);
    }

    /*
     * Run at least 64 rounds; after that, keep going while the round count
     * is below (last byte of the previous round's encrypted data + 32).
     * The second operand is only evaluated once i >= 64, i.e. after
     * in_data_len has been set by a completed round.
     */
    for (i = 0; i < 64 || i < (data[(in_data_len * 64) - 1] + 32); i++) {
        /* Build one unit: password + current block (+ U) ... */
        memcpy(data, password, pwlen);
        memcpy(data + pwlen, block, block_size);
        in_data_len = pwlen + block_size;
        if (NULL != U) {
            // If it's for the owner password check, we also concatenate the 48-byte U string.
            memcpy(data + pwlen + block_size, U, 48);
            in_data_len += 48;
        }

        /* ... and repeat it 64 times. */
        for (j = 1; j < 64; j++)
            memcpy(data + j * in_data_len, data, in_data_len);

        /* Encrypt in place: key = block[0..15], IV = block[16..31]. */
        aes_128cbc_encrypt(data, in_data_len * 64, data, &out_data_len, block, 16, block + 16);

        /* The first 16 encrypted bytes, taken modulo 3, select the next
         * hash; summing the bytes gives the same remainder. */
        for (j = 0, sum = 0; j < 16; j++)
            sum += data[j];

        block_size = 32 + (sum % 3) * 16;
        switch (block_size) {
            case 32:
                cl_sha256(data, in_data_len * 64, sha2_256, NULL);
                memcpy(block, sha2_256, 32);
                break;
            case 48:
                cl_sha384(data, in_data_len * 64, sha2_384, NULL);
                memcpy(block, sha2_384, 48);
                break;
            case 64:
                cl_sha512(data, in_data_len * 64, sha2_512, NULL);
                memcpy(block, sha2_512, 64);
                break;
        }
    }

    /* Only the first 32 bytes of the final block form the result. */
    memcpy(hash, block, 32);
}
/**
 * @brief Check if the owner password matches an empty password.
 *
 * Only encryption revision 6 is handled; any other R is logged as
 * unsupported. When the empty password validates against /O and /OE is
 * 32 bytes long, the file encryption key is recovered into pdf->key /
 * pdf->keylen and the DECRYPTABLE_PDF flag is set.
 *
 * Some details and comments for how to check the owner password comes from the PyPDF project:
 * https://github.com/py-pdf/pypdf/blob/3.17.4/pypdf/_encryption.py#L397
 *
 * @param pdf       The PDF context.
 * @param R         The encryption version.
 * @param O         The /O string.
 * @param U         The /U string.
 * @param OE        The /OE string.
 * @param OE_len    The length of the /OE string.
 */
static void check_owner_password(struct pdf_struct *pdf, int R,
                                 const char *O, const char *U,
                                 const char *OE, size_t OE_len)
{
    bool decryptable = false;

    dbg_printhex("U: ", U, 32);
    dbg_printhex("O: ", O, 32);

    if (6 == R) {
        unsigned char intermediate_key[32], expected[32];
        char empty_pw[]     = "";
        size_t empty_pw_len = 0;

        if (NULL == OE) {
            cli_dbgmsg("check_owner_password: Missing OE value!\n");
            noisy_warnmsg("check_owner_password: Missing OE value!\n");
            return;
        }
        dbg_printhex("OE: ", OE, OE_len);

        /*
         * Validate: hash the password with the owner validation salt
         * (bytes 32..39 of O) and the 48-byte U string.
         */
        compute_hash_r6(
            empty_pw,
            empty_pw_len,
            (const unsigned char *)(O + 32), // owner validation salt
            expected,
            U);

        if (0 != memcmp(O, expected, sizeof(expected))) {
            /* First 32 bytes of O differ: the empty string is not the owner password. */
            cli_dbgmsg("check_owner_password: Owner password check did not match!\n");
        } else {
            /*
             * Derive the intermediate owner key: same hash, but with the
             * owner key salt (bytes 40..47 of O).
             */
            compute_hash_r6(
                empty_pw,
                empty_pw_len,
                (const unsigned char *)(O + 40), // owner key salt
                intermediate_key,
                U);

            if (32 == OE_len) {
                pdf->keylen = 32;
                pdf->key    = cli_max_malloc(pdf->keylen);
                if (NULL == pdf->key) {
                    cli_errmsg("check_owner_password: Cannot allocate memory for pdf->key\n");
                    return;
                }

                /* Decrypting OE with the intermediate key yields the file encryption key. */
                aes_256cbc_decrypt((const unsigned char *)OE, &OE_len, (unsigned char *)(pdf->key), (char *)intermediate_key, 32, 0);
                dbg_printhex("check_owner_password: Candidate encryption key", pdf->key, pdf->keylen);
                decryptable = true;
            } else {
                cli_dbgmsg("check_owner_password: OE length is not 32: %zu\n", OE_len);
                noisy_warnmsg("check_owner_password: OE length is not 32: %zu\n", OE_len);
            }
        }
    } else {
        cli_dbgmsg("check_owner_password: Unknown or unsupported encryption version. R: %d\n", R);
        noisy_warnmsg("check_owner_password: Unknown or unsupported encryption version. R: %d\n", R);
    }

    if (!decryptable) {
        /* The key is not valid, we would need the user or the owner password to decrypt */
        cli_dbgmsg("check_owner_password: encrypted PDF found but cannot decrypt with empty owner password\n");
        noisy_warnmsg("check_owner_password: encrypted PDF found but cannot decrypt with empty owner password\n");
    } else {
        /* The key we computed above is the key used to encrypt the streams. We could decrypt it now if we wanted to */
        pdf->flags |= 1 << DECRYPTABLE_PDF;
        cli_dbgmsg("check_owner_password: encrypted PDF found, owner password is empty, will attempt to decrypt\n");
        noisy_msg(pdf, "check_owner_password: encrypted PDF found, owner password is empty, will attempt to decrypt\n");
    }
}
/**
 * @brief Check if the user password matches an empty password.
 *
 * Handles encryption revisions 2 through 6. When the empty password
 * validates, the candidate encryption key is left in pdf->key / pdf->keylen
 * and the DECRYPTABLE_PDF flag is set.
 *
 * @param pdf      The PDF context.
 * @param R        The encryption revision.
 * @param O        The /O string.
 * @param U        The /U string.
 * @param P        The /P permissions value (used for R2-R4).
 * @param EM       Non-zero if metadata is encrypted (used for R4).
 * @param UE       The /UE string (R5/R6), may be NULL.
 * @param UE_len   The length of the /UE string.
 * @param length   Key length in bits (used for R2-R4).
 */
static void check_user_password(struct pdf_struct *pdf, int R, const char *O,
                                const char *U, int32_t P, int EM,
                                const char *UE, size_t UE_len,
                                unsigned length)
{
    unsigned i;
    uint8_t result[16];
    char data[32];
    struct arc4_state arc4;
    bool password_empty = false;

    dbg_printhex("U: ", U, 32);
    dbg_printhex("O: ", O, 32);

    switch (R) {
        case 2:
        case 3:
        case 4: {
            unsigned char *d;
            size_t sz = 68 + pdf->fileIDlen + (R >= 4 && !EM ? 4 : 0);

            d = calloc(1, sz);
            if (!(d))
                goto done;

            /* Hash input: padding (empty password) | O | P | file ID [| 0xFFFFFFFF] */
            memcpy(d, key_padding, 32);
            memcpy(d + 32, O, 32);
            P = le32_to_host(P);
            memcpy(d + 64, &P, 4);
            memcpy(d + 68, pdf->fileID, pdf->fileIDlen);

            /* 7.6.3.3 Algorithm 2 */
            /* empty password, password == padding */
            if (R >= 4 && !EM) {
                uint32_t v = 0xFFFFFFFF;
                memcpy(d + 68 + pdf->fileIDlen, &v, 4);
            }

            cl_hash_data("md5", d, sz, result, NULL);
            free(d);
            if (length > 128)
                length = 128;
            if (R >= 3) {
                /* Yes, this really is on purpose */
                for (i = 0; i < 50; i++)
                    cl_hash_data("md5", result, length / 8, result, NULL);
            }
            if (R == 2)
                length = 40;

            pdf->keylen = length / 8;
            pdf->key    = cli_max_malloc(pdf->keylen);
            if (!pdf->key)
                goto done;

            memcpy(pdf->key, result, pdf->keylen);
            dbg_printhex("md5", (const char *)result, 16);
            dbg_printhex("Candidate encryption key", pdf->key, pdf->keylen);

            /* 7.6.3.3 Algorithm 6 */
            if (R == 2) {
                /* 7.6.3.3 Algorithm 4 */
                memcpy(data, key_padding, 32);
                if (false == arc4_init(&arc4, (const uint8_t *)(pdf->key), pdf->keylen)) {
                    noisy_warnmsg("check_user_password: failed to init arc4\n");
                    goto done;
                }
                arc4_apply(&arc4, (uint8_t *)data, 32);
                dbg_printhex("computed U (R2)", data, 32);
                if (!memcmp(data, U, 32))
                    password_empty = true;
            } else {
                // R is 3 or 4
                unsigned len = pdf->keylen;
                unsigned char *id_buf;

                id_buf = calloc(1, 32 + pdf->fileIDlen);
                if (!(id_buf))
                    goto done;

                /* 7.6.3.3 Algorithm 5 */
                memcpy(id_buf, key_padding, 32);
                memcpy(id_buf + 32, pdf->fileID, pdf->fileIDlen);
                cl_hash_data("md5", id_buf, 32 + pdf->fileIDlen, result, NULL);
                /* The buffer is only needed for the hash above; release it
                 * now so the early exits below cannot leak it. */
                free(id_buf);

                memcpy(data, pdf->key, len);
                if (false == arc4_init(&arc4, (const uint8_t *)data, len)) {
                    noisy_warnmsg("check_user_password: failed to init arc4\n");
                    goto done;
                }
                arc4_apply(&arc4, result, 16);

                /* 19 further RC4 passes, each keyed with (key XOR round number) */
                for (i = 1; i <= 19; i++) {
                    unsigned j;

                    for (j = 0; j < len; j++)
                        data[j] = pdf->key[j] ^ i;

                    if (false == arc4_init(&arc4, (const uint8_t *)data, len)) {
                        noisy_warnmsg("check_user_password: failed to init arc4\n");
                        goto done;
                    }
                    arc4_apply(&arc4, result, 16);
                }

                dbg_printhex("fileID", pdf->fileID, pdf->fileIDlen);
                dbg_printhex("computed U (R>=3)", (const char *)result, 16);
                if (!memcmp(result, U, 16))
                    password_empty = true;
            }

            break;
        }
        case 5: {
            uint8_t result2[32];

            /* supplement to ISO3200, 3.5.2 Algorithm 3.11 */
            /* user validation salt */
            cl_sha256(U + 32, 8, result2, NULL);
            dbg_printhex("Computed U", (const char *)result2, 32);
            if (!memcmp(result2, U, 32)) {
                /* Algorithm 3.2a could be used to recover encryption key */
                cl_sha256(U + 40, 8, result2, NULL);

                if (UE_len != 32) {
                    cli_dbgmsg("check_user_password: UE length is not 32: %zu\n", UE_len);
                    noisy_warnmsg("check_user_password: UE length is not 32: %zu\n", UE_len);
                } else {
                    pdf->keylen = 32;
                    pdf->key    = cli_max_malloc(pdf->keylen);
                    if (!pdf->key) {
                        cli_errmsg("check_user_password: Cannot allocate memory for pdf->key\n");
                        goto done;
                    }

                    aes_256cbc_decrypt((const unsigned char *)UE, &UE_len, (unsigned char *)(pdf->key), (char *)result2, 32, 0);
                    dbg_printhex("check_user_password: Candidate encryption key", pdf->key, pdf->keylen);

                    password_empty = true;
                }
            }

            break;
        }
        case 6: {
            unsigned char hash[32], validationkey[32];

            size_t pwlen = 0;
            char password[] = "";

            if (NULL == UE) {
                cli_dbgmsg("check_user_password: Missing UE value!\n");
                noisy_warnmsg("check_user_password: Missing UE value!\n");
                goto done;
            }

            dbg_printhex("UE: ", UE, UE_len);

            /*
             * Test the password against the user key by computing the SHA-256 hash of the UTF-8 password concatenated
             * with the 8 bytes of user validation salt.
             */
            compute_hash_r6(
                password,
                pwlen,
                (const unsigned char *)(U + 32), // user validation salt
                validationkey,
                NULL); // no U string for user password check

            /* If the 32-byte result matches the first 32 bytes of the U string, this is the user password. */
            if (0 != memcmp(U, validationkey, sizeof(validationkey))) {
                cli_dbgmsg("check_user_password: User password check did not match!\n");
                break;
            }

            /*
             * Compute an intermediate user key by computing the SHA-256 hash of the UTF-8 password concatenated with
             * the 8 bytes of user key salt.
             */
            compute_hash_r6(
                password,
                pwlen,
                (const unsigned char *)(U + 40), // user key salt
                hash,
                NULL); // no U string for user password check

            if (UE_len != 32) {
                cli_dbgmsg("check_user_password: UE length is not 32: %zu\n", UE_len);
                noisy_warnmsg("check_user_password: UE length is not 32: %zu\n", UE_len);
            } else {
                pdf->keylen = 32;
                pdf->key    = cli_max_malloc(pdf->keylen);
                if (!pdf->key) {
                    cli_errmsg("check_user_password: Cannot allocate memory for pdf->key\n");
                    goto done;
                }

                aes_256cbc_decrypt((const unsigned char *)UE, &UE_len, (unsigned char *)(pdf->key), (char *)hash, 32, 0);
                dbg_printhex("check_user_password: Candidate encryption key", pdf->key, pdf->keylen);

                password_empty = true;
            }

            break;
        }
        default: {
            /* Supported R is in {2,3,4,5,6} */
            cli_dbgmsg("check_user_password: R value out of range\n");
            noisy_warnmsg("check_user_password: R value out of range\n");
        }
    }

    if (password_empty) {
        cli_dbgmsg("check_user_password: user password is empty\n");
        noisy_msg(pdf, "check_user_password: encrypted PDF found, user password is empty, will attempt to decrypt\n");
        /* The key we computed above is the key used to encrypt the streams.
         * We could decrypt it now if we wanted to */
        pdf->flags |= 1 << DECRYPTABLE_PDF;
    } else {
        /* the key is not valid, we would need the user or the owner password to decrypt */
        cli_dbgmsg("check_user_password: user/owner password would be required for decryption\n");
        noisy_warnmsg("check_user_password: encrypted PDF found, user password is NOT empty, cannot decrypt!\n");
    }

done:
    return;
}
enum enc_method parse_enc_method(const char *dict, unsigned len, const char *key, enum enc_method def)
{
const char *q;
char *CFM = NULL;
enum enc_method ret = ENC_UNKNOWN;
if (!key)
return def;
if (!strcmp(key, "Identity"))
return ENC_IDENTITY;
q = pdf_getdict(dict, (int *)(&len), key);
if (!q)
return def;
CFM = pdf_readval(q, len, "/CFM");
if (CFM) {
cli_dbgmsg("parse_enc_method: %s CFM: %s\n", key, CFM);
if (!strncmp(CFM, "V2", 2))
ret = ENC_V2;
else if (!strncmp(CFM, "AESV2", 5))
ret = ENC_AESV2;
else if (!strncmp(CFM, "AESV3", 5))
ret = ENC_AESV3;
else if (!strncmp(CFM, "None", 4))
ret = ENC_NONE;
free(CFM);
}
return ret;
}
/**
 * @brief Read the document's encryption dictionary and try to recover the
 *        encryption key using an empty owner or user password.
 *
 * Requires that an encryption object was recorded (pdf->enc_objid) and that
 * a file ID is known. Reads /Length, /R, /P, /O, /U and, depending on R,
 * /EncryptMetadata, /StmF, /StrF, /EFF, /CF, /UE and /OE. Sets the
 * pdf->enc_method_* fields and then calls check_owner_password() and, if no
 * key was produced, check_user_password().
 *
 * @param pdf  PDF context.
 */
void pdf_handle_enc(struct pdf_struct *pdf)
{
    struct pdf_obj *obj;
    uint32_t len, n, R, P, length, EM = 1, i, oulen;
    char *O       = NULL;
    char *OE      = NULL;
    size_t OE_len = 0;
    char *U       = NULL;
    char *UE      = NULL;
    size_t UE_len = 0;
    char *StmF    = NULL;
    char *StrF    = NULL;
    char *EFF     = NULL;
    const char *q, *q2;

    if (pdf->enc_objid == ~0u)
        return;
    if (!pdf->fileID) {
        cli_dbgmsg("pdf_handle_enc: no file ID\n");
        noisy_warnmsg("pdf_handle_enc: no file ID\n");
        return;
    }

    obj = find_obj(pdf, pdf->objs[0], pdf->enc_objid);
    if (!obj) {
        cli_dbgmsg("pdf_handle_enc: can't find encryption object %d %d\n", pdf->enc_objid >> 8, pdf->enc_objid & 0xff);
        noisy_warnmsg("pdf_handle_enc: can't find encryption object %d %d\n", pdf->enc_objid >> 8, pdf->enc_objid & 0xff);
        return;
    }

    len = obj->size;

    if (NULL == obj->objstm) {
        q = (const char *)(obj->start + pdf->map);
        if (!CLI_ISCONTAINED(pdf->map, pdf->size, q, len)) {
            cli_dbgmsg("pdf_handle_enc: encryption object found, but not contained in PDF\n");
            noisy_warnmsg("pdf_handle_enc: encryption object found, but not contained in PDF\n");
            return;
        }
    } else {
        q = (const char *)(obj->start + obj->objstm->streambuf);
        if (!CLI_ISCONTAINED(obj->objstm->streambuf, obj->objstm->streambuf_len, q, len)) {
            cli_dbgmsg("pdf_handle_enc: encryption object found, but not contained in PDF streambuf\n");
            noisy_warnmsg("pdf_handle_enc: encryption object found, but not contained in PDF streambuf\n");
            return;
        }
    }

    pdf->enc_method_string       = ENC_UNKNOWN;
    pdf->enc_method_stream       = ENC_UNKNOWN;
    pdf->enc_method_embeddedfile = ENC_UNKNOWN;

    q2 = cli_memstr(q, len, "/Standard", 9);
    if (!q2) {
        cli_dbgmsg("pdf_handle_enc: /Standard not found\n");
        noisy_warnmsg("pdf_handle_enc: /Standard not found\n");
        goto done;
    }

    /* we can have both of these:
     * /AESV2/Length /Standard/Length
     * /Length /Standard
     * make sure we don't mistake AES's length for Standard's */
    length = pdf_readint(q2, len - (q2 - q), "/Length");
    if (length == ~0u)
        length = pdf_readint(q, len, "/Length");

    if (length < 40) {
        cli_dbgmsg("pdf_handle_enc: invalid length: %u\n", length);
        length = 40;
    }

    R = pdf_readint(q, len, "/R");
    if (R == ~0u) {
        cli_dbgmsg("pdf_handle_enc: invalid R\n");
        noisy_warnmsg("pdf_handle_enc: invalid R\n");
        goto done;
    }

    if ((R > 6) || (R < 2)) {
        cli_dbgmsg("pdf_handle_enc: R value outside supported range [2..6]\n");
        noisy_warnmsg("pdf_handle_enc: R value outside supported range [2..6]\n");
        goto done;
    }

    P = pdf_readint(q, len, "/P");
    if (R < 6) { // P field doesn't seem to be required for R6.
        if (P == ~0u) {
            cli_dbgmsg("pdf_handle_enc: invalid P\n");
            noisy_warnmsg("pdf_handle_enc: invalid P\n");
            goto done;
        }
    }

    /* Minimum length of the /O and /U strings for this revision. */
    if (R < 5) {
        oulen = 32;
    } else {
        oulen = 48;
    }

    if (R == 2 || R == 3) {
        pdf->enc_method_stream       = ENC_V2;
        pdf->enc_method_string       = ENC_V2;
        pdf->enc_method_embeddedfile = ENC_V2;
    } else if (R == 4 || R == 5 || R == 6) {
        EM   = pdf_readbool(q, len, "/EncryptMetadata", 1);
        StmF = pdf_readval(q, len, "/StmF");
        StrF = pdf_readval(q, len, "/StrF");
        EFF  = pdf_readval(q, len, "/EFF");
        n    = len;
        /* pdf->CF points into the object's buffer; n is left as the number
         * of bytes remaining from that point. */
        pdf->CF   = pdf_getdict(q, (int *)(&n), "/CF");
        pdf->CF_n = n;

        if (StmF) {
            cli_dbgmsg("pdf_handle_enc: StmF: %s\n", StmF);
        }
        if (StrF) {
            cli_dbgmsg("pdf_handle_enc: StrF: %s\n", StrF);
        }
        if (EFF) {
            cli_dbgmsg("pdf_handle_enc: EFF: %s\n", EFF);
        }

        pdf->enc_method_stream       = parse_enc_method(pdf->CF, n, StmF, ENC_IDENTITY);
        pdf->enc_method_string       = parse_enc_method(pdf->CF, n, StrF, ENC_IDENTITY);
        pdf->enc_method_embeddedfile = parse_enc_method(pdf->CF, n, EFF, pdf->enc_method_stream);

        cli_dbgmsg("pdf_handle_enc: EncryptMetadata: %s\n", EM ? "true" : "false");

        if (R == 4) {
            length = 128;
        } else {
            length = 256;

            /*
             * Read the UE value (for checking user-password)
             */
            n      = 0;
            UE     = pdf_readstring(q, len, "/UE", &n, NULL, false);
            UE_len = n;

            /*
             * Read the OE value (for checking owner-password)
             */
            n      = 0;
            OE     = pdf_readstring(q, len, "/OE", &n, NULL, false);
            OE_len = n;
        }
    }

    if (length == ~0u)
        length = 40;

    /*
     * Read the O value
     */
    n = 0;
    O = pdf_readstring(q, len, "/O", &n, NULL, false);
    if (!O || n < oulen) {
        cli_dbgmsg("pdf_handle_enc: invalid O: %u\n", n);
        noisy_warnmsg("pdf_handle_enc: invalid O: %u\n", n);
        if (O) {
            dbg_printhex("invalid O", O, n);
        }
        goto done;
    }
    if (n > oulen) {
        /* Extra bytes are tolerated only if they are all zero. */
        for (i = oulen; i < n; i++) {
            if (O[i]) {
                dbg_printhex("pdf_handle_enc: too long O", O, n);
                noisy_warnmsg("pdf_handle_enc: too long O: %u\n", n);
                goto done;
            }
        }
    }

    /*
     * Read the U value
     */
    n = 0;
    U = pdf_readstring(q, len, "/U", &n, NULL, false);
    if (!U || n < oulen) {
        cli_dbgmsg("pdf_handle_enc: invalid U: %u\n", n);
        noisy_warnmsg("pdf_handle_enc: invalid U: %u\n", n);
        if (U) {
            dbg_printhex("invalid U", U, n);
        }
        goto done;
    }
    if (n > oulen) {
        /* Extra bytes are tolerated only if they are all zero. */
        for (i = oulen; i < n; i++) {
            if (U[i]) {
                dbg_printhex("too long U", U, n);
                noisy_warnmsg("pdf_handle_enc: too long U: %u\n", n);
                goto done;
            }
        }
    }

    cli_dbgmsg("pdf_handle_enc: Encrypt R: %u, P %x, length: %u\n", R, P, length);
    if (length % 8) {
        cli_dbgmsg("pdf_handle_enc: wrong key length, not multiple of 8\n");
        noisy_warnmsg("pdf_handle_enc: wrong key length, not multiple of 8\n");
        goto done;
    }

    // Check the owner password.
    check_owner_password(pdf, R, O, U, OE, OE_len);
    if (NULL == pdf->key) {
        // Wasn't the owner password, let's try the user password.
        check_user_password(pdf, R, O, U, P, EM, UE, UE_len, length);
    }

done:
    free(O);
    free(OE);
    free(U);
    free(UE);
    free(StmF);
    free(StrF);
    free(EFF);
}
/**
 * @brief Locate every object stored in an object stream and parse each one.
 *
 * Objects discovered here are only parsed; the doc comment inherited from the
 * original states that extraction happens later, when the caller's extraction
 * loop continues.
 *
 * @param pdf       Pdf struct that keeps track of all information found in the PDF.
 * @param objstm    Pointer to the object stream to walk.
 *
 * @return CL_SUCCESS   Every lookup succeeded, or the lookup loop ended with CL_BREAK.
 * @return CL_EARG      pdf, objstm or objstm->streambuf is NULL.
 * @return CL_ETIMEOUT  The scan time limit was reached.
 * @return CL_EFORMAT   The stream is empty, its header values are inconsistent,
 *                      or an object lookup failed with something other than CL_BREAK.
 */
cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm)
{
    cl_error_t status   = CL_EFORMAT;
    cl_error_t retval   = CL_EPARSE;
    uint32_t badobjects = 0;
    size_t i            = 0;
    struct pdf_obj *obj = NULL;

    /* pdf is dereferenced below (pdf->ctx, pdf->stats), so validate it together
     * with the object stream instead of crashing on a NULL argument. */
    if ((NULL == pdf) || (NULL == objstm) || (NULL == objstm->streambuf)) {
        status = CL_EARG;
        goto done;
    }

    if ((0 == objstm->first) ||
        (0 == objstm->streambuf_len) ||
        (0 == objstm->n)) {
        cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Empty object stream.\n");
        goto done;
    }

    if (objstm->first >= objstm->streambuf_len) {
        cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Invalid objstm values. Offset of first obj greater than stream length.\n");
        goto done;
    }

    /* Process each object the stream header claims to contain. */
    for (i = 0; i < objstm->n; i++) {
        obj = NULL;

        if (cli_checktimelimit(pdf->ctx) != CL_SUCCESS) {
            cli_dbgmsg("Timeout reached in the PDF parser while parsing object stream.\n");
            status = CL_ETIMEOUT;
            goto done;
        }

        /* Find object */
        retval = pdf_findobj_in_objstm(pdf, objstm, &obj);
        if (retval != CL_SUCCESS) {
            /* CL_BREAK ends the loop quietly; any other failure is counted
             * as an invalid object before stopping. */
            if (retval != CL_BREAK) {
                cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Fewer objects in stream than expected: %zu found, %zu expected.\n",
                           objstm->nobjs_found, objstm->n);
                badobjects++;
                pdf->stats.ninvalidobjs++;
            }
            break;
        }

        cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Found object %u %u in object stream at offset: %u\n", obj->id >> 8, obj->id & 0xff, obj->start);

        if (cli_checktimelimit(pdf->ctx) != CL_SUCCESS) {
            cli_dbgmsg("Timeout reached in the PDF parser while parsing object stream.\n");
            status = CL_ETIMEOUT;
            goto done;
        }

        /* Parse object */
        pdf_parseobj(pdf, obj);
    }

    if (badobjects) {
        status = CL_EFORMAT;
        goto done;
    }

    status = CL_SUCCESS;

done:
    return status;
}
/**
 * @brief Discover all objects in the PDF buffer, parse them, then extract them.
 *
 * The work is done in passes: locate objects, parse every located object,
 * process encryption settings, run the "parsed" hooks, and finally extract
 * each object. A format error on a single object does not stop extraction;
 * it is counted and reported as CL_EFORMAT at the end if nothing worse happened.
 *
 * @param pdf   Pdf struct that keeps track of all information found in the PDF.
 *
 * @return cl_error_t   Error code.
 */
static cl_error_t pdf_find_and_extract_objs(struct pdf_struct *pdf)
{
    cl_error_t status   = CL_SUCCESS;
    int32_t found       = 0;
    unsigned int idx    = 0;
    uint32_t badobjects = 0;
    /* Not read directly in this function; presumably consumed by the SCAN_*
     * macro used below, so the name must stay `ctx`. */
    cli_ctx *ctx = NULL;

    if (NULL == pdf) {
        cli_errmsg("pdf_find_and_extract_objs: Invalid arguments.\n");
        status = CL_EARG;
        goto done;
    }

    ctx = pdf->ctx;

    /* Pass 1: parse PDF and find obj offsets until the finder reports CL_BREAK. */
    for (;;) {
        found = pdf_findobj(pdf);
        if (CL_BREAK == found) {
            break;
        }
        if (CL_EMEM == found) {
            cli_errmsg("pdf_find_and_extract_objs: Memory allocation error.\n");
            status = CL_EMEM;
            goto done;
        }
    }

    /* Pass 2: must parse after finding all objs, so we can flag indirect objects. */
    for (idx = 0; idx < pdf->nobjs; idx++) {
        if (CL_SUCCESS != cli_checktimelimit(pdf->ctx)) {
            cli_dbgmsg("pdf_find_and_extract_objs: Timeout reached in the PDF parser while parsing objects.\n");
            status = CL_ETIMEOUT;
            goto done;
        }

        pdf_parseobj(pdf, pdf->objs[idx]);
    }

    pdf_handle_enc(pdf);

    if (pdf->flags & (1 << ENCRYPTED_PDF)) {
        cli_dbgmsg("pdf_find_and_extract_objs: encrypted pdf found, %s!\n",
                   (pdf->flags & (1 << DECRYPTABLE_PDF)) ? "decryptable" : "not decryptable, stream will probably fail to decompress");
    }

    if (SCAN_HEURISTIC_ENCRYPTED_DOC &&
        (pdf->flags & (1 << ENCRYPTED_PDF)) &&
        !(pdf->flags & (1 << DECRYPTABLE_PDF))) {
        /* It is encrypted, and a password/key needs to be supplied to decrypt.
         * This doesn't trigger for PDFs that are encrypted but don't need
         * a password to decrypt */
        status = cli_append_potentially_unwanted(pdf->ctx, "Heuristics.Encrypted.PDF");
    }

    if (CL_SUCCESS != status) {
        goto done;
    }

    status = run_pdf_hooks(pdf, PDF_PHASE_PARSED, -1, NULL);
    cli_dbgmsg("pdf_find_and_extract_objs: (parsed hooks) returned %d\n", status);

    if (CL_SUCCESS != status) {
        goto done;
    }

    /* Pass 3: extract PDF objs. pdf->nobjs is re-read every iteration. */
    idx = 0;
    while (!status && idx < pdf->nobjs) {
        if (CL_SUCCESS != cli_checktimelimit(pdf->ctx)) {
            cli_dbgmsg("pdf_find_and_extract_objs: Timeout reached in the PDF parser while extracting objects.\n");
            status = CL_ETIMEOUT;
            goto done;
        }

        pdf->parse_recursion_depth++;
        status = pdf_extract_obj(pdf, pdf->objs[idx], PDF_EXTRACT_OBJ_SCAN);
        pdf->parse_recursion_depth--;

        if (CL_EFORMAT == status) {
            /* Don't halt on one bad object */
            cli_dbgmsg("pdf_find_and_extract_objs: Format error when extracting object, skipping to the next object.\n");
            badobjects++;
            pdf->stats.ninvalidobjs++;
            status = CL_CLEAN;
        }

        idx++;
    }

done:
    /* Surface skipped objects only when nothing more serious was reported. */
    if (badobjects && (CL_SUCCESS == status)) {
        status = CL_EFORMAT;
    }

    return status;
}
/**
 * @brief Primary function for parsing and scanning a PDF.
 *
 * Steps, in order: validate the "%PDF-" header and version, look for the
 * "%%EOF" / "startxref" markers in the last 2048 bytes of the map, map the
 * document body, run the pre hooks, find/parse/extract all objects, run the
 * end hooks, export collected statistics and release everything that was
 * allocated on the local pdf_struct.
 *
 * @param dir       Filepath for temp file.
 * @param ctx       clam scan context structure.
 * @param offset    offset of pdf in ctx->fmap
 *
 * @return cl_error_t   Status value. CL_BREAK coming from hooks is mapped to
 *                      CL_CLEAN; CL_SUCCESS becomes CL_EFORMAT when invalid
 *                      objects were counted.
 */
cl_error_t cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
{
    cl_error_t rc = CL_SUCCESS;
    struct pdf_struct pdf;
    fmap_t *map   = ctx->fmap;
    size_t size   = map->len - offset;
    off_t versize = size > 1032 ? 1032 : size;
    off_t map_off, bytesleft;
    unsigned long xref;
    long temp_long;
    const char *pdfver, *tmp, *start, *eofmap, *q, *eof;
    unsigned i;
    unsigned int objs_found = 0;
    json_object *pdfobj     = NULL;
    char *begin, *end, *p1;

    cli_dbgmsg("in cli_pdf(%s)\n", dir);

    /* Zero the state first so the cleanup code below is safe on every exit path. */
    memset(&pdf, 0, sizeof(pdf));
    pdf.ctx       = ctx;
    pdf.dir       = dir;
    pdf.enc_objid = ~0u;

    pdfver = start = fmap_need_off_once(map, offset, versize);

    /* Check PDF version */
    if (!pdfver) {
        cli_errmsg("cli_pdf: mmap() failed (1)\n");
        rc = CL_EMAP;
        goto done;
    }

    if (ctx->this_layer_metadata_json)
        pdfobj = cli_jsonobj(ctx->this_layer_metadata_json, "PDFStats");

    /* offset is 0 when coming from filetype2 */
    tmp = cli_memstr(pdfver, versize, "%PDF-", 5);
    if (!tmp) {
        cli_dbgmsg("cli_pdf: no PDF- header found\n");
        noisy_warnmsg("cli_pdf: no PDF- header found\n");
        rc = CL_SUCCESS;
        goto done;
    }

    versize -= tmp - pdfver;
    pdfver = tmp;

    if (versize < 8) {
        rc = CL_EFORMAT;
        goto done;
    }

    /* Check for PDF-1.[0-9]. Although 1.7 is highest now, allow for future versions */
    if (pdfver[5] != '1' || pdfver[6] != '.' ||
        pdfver[7] < '1' || pdfver[7] > '9') {
        pdf.flags |= 1 << BAD_PDF_VERSION;
        cli_dbgmsg("cli_pdf: bad pdf version: %.8s\n", pdfver);
        if (pdfobj)
            cli_jsonbool(pdfobj, "BadVersion", 1);
    } else {
        if (pdfobj) {
            const char *hdr_end = pdfver + versize;

            begin = (char *)(pdfver + 5);
            end   = begin + 2;
            /* Consume the minor-version digits. The mapped header is not
             * NUL-terminated, so scan with an explicit bound instead of
             * handing the buffer to strtoul(). pdfver[7] is known to be a
             * digit here, so this selects the same span strtoul() would. */
            while (((const char *)end < hdr_end) && isdigit((unsigned char)*end))
                end++;

            p1 = cli_max_calloc((end - begin) + 2, 1);
            if (p1) {
                strncpy(p1, begin, end - begin);
                p1[end - begin] = '\0';
                cli_jsonstr(pdfobj, "PDFVersion", p1);
                free(p1);
            }
        }
    }

    if (pdfver != start || offset) {
        pdf.flags |= 1 << BAD_PDF_HEADERPOS;
        cli_dbgmsg("cli_pdf: PDF header is not at position 0: %lld\n", (long long)(pdfver - start + offset));
        if (pdfobj)
            cli_jsonbool(pdfobj, "BadVersionLocation", 1);
    }

    /* From here on, offset is the position of "%PDF-" inside the map. */
    offset += pdfver - start;

    /* find trailer and xref, don't fail if not found */
    map_off = (off_t)map->len - 2048;
    if (map_off < 0)
        map_off = 0;

    bytesleft = map->len - map_off;

    eofmap = fmap_need_off_once(map, map_off, bytesleft);
    if (!eofmap) {
        cli_errmsg("cli_pdf: mmap() failed (2)\n");
        rc = CL_EMAP;
        goto done;
    }

    eof = eofmap + bytesleft;
    /* Search backwards for the end-of-file marker. */
    for (q = &eofmap[bytesleft - 5]; q > eofmap; q--) {
        if (memcmp(q, "%%EOF", 5) == 0)
            break;
    }

    if (q <= eofmap) {
        pdf.flags |= 1 << BAD_PDF_TRAILER;
        cli_dbgmsg("cli_pdf: %%%%EOF not found\n");
        if (pdfobj)
            cli_jsonbool(pdfobj, "NoEOF", 1);
    } else {
        /*size = q - eofmap + map_off;*/
        q -= 9;
        for (; q > eofmap; q--) {
            if (memcmp(q, "startxref", 9) == 0)
                break;
        }

        if (q <= eofmap) {
            pdf.flags |= 1 << BAD_PDF_TRAILER;
            cli_dbgmsg("cli_pdf: startxref not found\n");
            if (pdfobj)
                cli_jsonbool(pdfobj, "NoXREF", 1);
        } else {
            pdf_parse_trailer(&pdf, eofmap, eof - eofmap);
            q += 9;

            while (q < eof && (*q == ' ' || *q == '\n' || *q == '\r')) {
                q++;
            }

            /* The length given to the parser must be the number of bytes
             * still available at q (up to eof), not q's absolute position
             * in the file; otherwise the parser may read past the mapping. */
            if ((q >= eof) ||
                (CL_SUCCESS != cli_strntol_wrap(q, (size_t)(eof - q), 0, 10, &temp_long))) {
                cli_dbgmsg("cli_pdf: failed to parse PDF trailer xref\n");
                pdf.flags |= 1 << BAD_PDF_TRAILER;
            } else if (temp_long < 0) {
                cli_dbgmsg("cli_pdf: Encountered invalid negative PDF trailer xref (%ld).\n", temp_long);
                pdf.flags |= 1 << BAD_PDF_TRAILER;
            } else {
                xref      = (unsigned long)temp_long;
                bytesleft = map->len - offset - xref;
                if (bytesleft > 4096)
                    bytesleft = 4096;

                q = fmap_need_off_once(map, offset + xref, bytesleft);
                if (!q || xrefCheck(q, q + bytesleft) == -1) {
                    cli_dbgmsg("cli_pdf: did not find valid xref\n");
                    pdf.flags |= 1 << BAD_PDF_TRAILER;
                }
            }
        }
    }

    /* Bytes available from the header to the end of the map. Recomputed from
     * the map length so that a non-zero caller offset is not subtracted twice
     * (size was already reduced by the original offset above). */
    size = map->len - offset;

    pdf.size = size;
    pdf.map  = fmap_need_off(map, offset, size);
    if (!pdf.map) {
        cli_errmsg("cli_pdf: mmap() failed (3)\n");
        rc = CL_EMAP;
        goto done;
    }

    pdf.startoff = offset;

    rc = run_pdf_hooks(&pdf, PDF_PHASE_PRE, -1, NULL);
    if (CL_SUCCESS != rc) {
        cli_dbgmsg("cli_pdf: (pre hooks) returning %d\n", rc);

        rc = rc == CL_BREAK ? CL_CLEAN : rc;
        goto done;
    }

    /*
     * Find and extract all objects in the PDF.
     * This methodology adds objects from object streams.
     */
    objs_found = pdf.nobjs;
    rc         = pdf_find_and_extract_objs(&pdf);
    if (CL_EMEM == rc) {
        cli_dbgmsg("cli_pdf: pdf_find_and_extract_objs had an allocation failure\n");
        /* Skips the invalid-object status adjustment at `done`. */
        goto err;
    } else if (pdf.nobjs <= objs_found) {
        cli_dbgmsg("cli_pdf: pdf_find_and_extract_objs did not find any new objects!\n");
    } else {
        cli_dbgmsg("cli_pdf: pdf_find_and_extract_objs found %d new objects.\n", pdf.nobjs - objs_found);
    }

    /* For encrypted documents, clear the stream-decoding flags. */
    if (pdf.flags & (1 << ENCRYPTED_PDF))
        pdf.flags &= ~((1 << BAD_FLATESTART) | (1 << BAD_STREAMSTART) | (1 << BAD_ASCIIDECODE));

    if (pdf.flags && CL_SUCCESS == rc) {
        cli_dbgmsg("cli_pdf: flags 0x%02x\n", pdf.flags);

        rc = run_pdf_hooks(&pdf, PDF_PHASE_END, -1, NULL);
        if (CL_SUCCESS == rc && SCAN_HEURISTICS && (ctx->dconf->other & OTHER_CONF_PDFNAMEOBJ)) {
            if (pdf.flags & (1 << ESCAPED_COMMON_PDFNAME)) {
                /* for example /Fl#61te#44#65#63#6f#64#65 instead of /FlateDecode */
                rc = cli_append_potentially_unwanted(ctx, "Heuristics.PDF.ObfuscatedNameObject");
            }
        }
#if 0
        /* TODO: find both trailers, and /Encrypt settings */
        if (pdf.flags & (1 << LINEARIZED_PDF))
            pdf.flags &= ~ (1 << BAD_ASCIIDECODE);
        if (pdf.flags & (1 << MANY_FILTERS))
            pdf.flags &= ~ (1 << BAD_ASCIIDECODE);
        if (CL_SUCCESS == rc && (pdf.flags &
            ((1 << BAD_PDF_TOOMANYOBJS) | (1 << BAD_STREAM_FILTERS) |
             (1<<BAD_FLATE) | (1<<BAD_ASCIIDECODE)|
             (1<<UNTERMINATED_OBJ_DICT) | (1<<UNKNOWN_FILTER)))) {
            rc = CL_EUNPACK;
        }
#endif
    }

done:
    if (CL_SUCCESS == rc && pdf.stats.ninvalidobjs > 0) {
        rc = CL_EFORMAT;
    }

err:
    pdf_export_json(&pdf);

    /* Release object streams and their buffers. */
    if (pdf.objstms) {
        for (i = 0; i < pdf.nobjstms; i++) {
            if (pdf.objstms[i]) {
                if (pdf.objstms[i]->streambuf) {
                    free(pdf.objstms[i]->streambuf);
                    pdf.objstms[i]->streambuf = NULL;
                }
                free(pdf.objstms[i]);
                pdf.objstms[i] = NULL;
            }
        }
        free(pdf.objstms);
        pdf.objstms = NULL;
    }

    /* Release the object table and each object's path string. */
    if (NULL != pdf.objs) {
        for (i = 0; i < pdf.nobjs; i++) {
            if (NULL != pdf.objs[i]) {
                if (NULL != pdf.objs[i]->path) {
                    free(pdf.objs[i]->path);
                    pdf.objs[i]->path = NULL;
                }
                free(pdf.objs[i]);
                pdf.objs[i] = NULL;
            }
        }
        free(pdf.objs);
        pdf.objs = NULL;
    }

    if (pdf.fileID) {
        free(pdf.fileID);
        pdf.fileID = NULL;
    }

    if (pdf.key) {
        free(pdf.key);
        pdf.key = NULL;
    }

    /* PDF hooks may abort, don't return CL_BREAK to caller! */
    rc = (rc == CL_BREAK) ? CL_CLEAN : rc;

    cli_dbgmsg("cli_pdf: returning %d\n", rc);
    return rc;
}
/**
 * @brief Skip the rest of the current line, and find the start of the next line.
 *
 * First advances to the first line-terminator byte, then advances past the
 * whole run of terminator bytes that follows. Besides '\r' and '\n', a NUL
 * byte is also treated as a terminator byte by this routine.
 *
 * @param ptr   Current offset into buffer.
 * @param len   Remaining bytes in buffer.
 *
 * @return const char*  Address of next line, or NULL if no next line in buffer.
 */
static const char *
pdf_nextlinestart(const char *ptr, size_t len)
{
    const char *cur  = ptr;
    size_t remaining = len;

    if ((NULL == cur) || (0 == remaining)) {
        /* Invalid args */
        return NULL;
    }

    /* Phase 1: walk over ordinary bytes until a terminator byte is reached. */
    for (; ('\r' != *cur) && ('\n' != *cur) && ('\0' != *cur); cur++) {
        remaining--;
        if (0 == remaining) {
            return NULL;
        }
    }

    /* Phase 2: walk over the run of terminator bytes. */
    for (; ('\r' == *cur) || ('\n' == *cur) || ('\0' == *cur); cur++) {
        remaining--;
        if (0 == remaining) {
            return NULL;
        }
    }

    return cur;
}
/**
 * @brief Return the start of the next PDF object.
 *
 * This assumes that we're not in a stream.
 *
 * The scan starts "inside" a token: ordinary bytes are skipped until some
 * separator (whitespace, newline, comment, '[' or '<') has been seen, after
 * which the first ordinary byte is returned. A '/' or '(' is returned
 * immediately, wherever it appears.
 *
 * @param ptr   Current offset into buffer.
 * @param len   Remaining bytes in buffer.
 *
 * @return const char*  Address of next object in the buffer, or NULL if there is none in the buffer.
 */
static const char *
pdf_nextobject(const char *ptr, size_t len)
{
    const char *cursor = ptr;
    size_t remaining   = len;
    int in_token       = 1;

    while (remaining > 0) {
        const char ch = *cursor;

        /* Start of a name object, or start of JS. */
        if (('/' == ch) || ('(' == ch)) {
            return cursor;
        }

        /* Newlines and comments: jump to the start of the next line. */
        if (('\n' == ch) || ('\r' == ch) || ('%' == ch)) {
            const char *next_line = pdf_nextlinestart(cursor, remaining);
            if (NULL == next_line) {
                return NULL;
            }

            remaining -= (size_t)(next_line - cursor);
            cursor   = next_line;
            in_token = 0;
            continue;
        }

        if ((' ' == ch) || ('\t' == ch) || ('\v' == ch) || ('\f' == ch) ||
            ('[' == ch) /* Start of an array object */ ||
            ('<' == ch) /* Start of a dictionary object */) {
            in_token = 0;
        } else if (!in_token) {
            /* TODO: parse and return object type */
            return cursor;
        }

        cursor++;
        remaining--;
    }

    return NULL;
}
/* PDF statistics */
/* Statistics callback: adds one to pdf->stats.nasciihexdecode. obj and act are ignored. */
static void ASCIIHexDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.nasciihexdecode += 1;
    }
}
/* Statistics callback: adds one to pdf->stats.nascii85decode. obj and act are ignored. */
static void ASCII85Decode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.nascii85decode += 1;
    }
}
/* Statistics callback: adds one to pdf->stats.nembeddedfile. obj and act are ignored. */
static void EmbeddedFile_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.nembeddedfile += 1;
    }
}
/* Statistics callback: adds one to pdf->stats.nflate. obj and act are ignored. */
static void FlateDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.nflate += 1;
    }
}
/* Statistics callback: adds one to pdf->stats.nimage. obj and act are ignored. */
static void Image_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.nimage += 1;
    }
}
/* Statistics callback: adds one to pdf->stats.nlzw. obj and act are ignored. */
static void LZWDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.nlzw += 1;
    }
}
/* Statistics callback: adds one to pdf->stats.nrunlengthdecode. obj and act are ignored. */
static void RunLengthDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.nrunlengthdecode += 1;
    }
}
/* Statistics callback: adds one to pdf->stats.nfaxdecode. obj and act are ignored. */
static void CCITTFaxDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.nfaxdecode += 1;
    }
}
/*
 * Statistics callback for JBIG2: appends (obj->id >> 8) to the "JBIG2Objects"
 * array under "PDFStats" and then adds one to pdf->stats.njbig2decode.
 * Nothing is recorded (and the counter is not touched) unless metadata
 * collection is enabled and the JSON objects can be obtained.
 */
static void JBIG2Decode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    /* Presumably read by SCAN_COLLECT_METADATA; the name must stay `ctx`. */
    cli_ctx *ctx = NULL;
    struct json_object *stats_json;
    struct json_object *jbig2_list;

    UNUSEDPARAM(obj);
    UNUSEDPARAM(act);

    if (NULL == pdf) {
        return;
    }

    ctx = pdf->ctx;

    if (!(SCAN_COLLECT_METADATA) || !(pdf->ctx->this_layer_metadata_json)) {
        return;
    }

    stats_json = cli_jsonobj(pdf->ctx->this_layer_metadata_json, "PDFStats");
    if (NULL == stats_json) {
        return;
    }

    jbig2_list = cli_jsonarray(stats_json, "JBIG2Objects");
    if (NULL == jbig2_list) {
        return;
    }

    cli_jsonint_array(jbig2_list, obj->id >> 8);

    pdf->stats.njbig2decode++;
}
/* Statistics callback: adds one to pdf->stats.ndctdecode. obj and act are ignored. */
static void DCTDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.ndctdecode += 1;
    }
}
/* Statistics callback: adds one to pdf->stats.njpxdecode. obj and act are ignored. */
static void JPXDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.njpxdecode += 1;
    }
}
/* Statistics callback: adds one to pdf->stats.ncrypt. obj and act are ignored. */
static void Crypt_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.ncrypt += 1;
    }
}
/* Statistics callback: adds one to pdf->stats.nstandard. obj and act are ignored. */
static void Standard_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.nstandard += 1;
    }
}
/* Statistics callback: adds one to pdf->stats.nsigned. obj and act are ignored. */
static void Sig_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.nsigned += 1;
    }
}
/*
 * Intentionally a no-op.
 *
 * Neither pdf->stats nor the JSON output is updated here. The actual
 * Javascript is looked for when the object is extracted later, which
 * prevents false positives when objects reference an indirect object
 * that doesn't actually have any content.
 */
static void JavaScript_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);
    UNUSEDPARAM(pdf);
}
/* Statistics callback: adds one to pdf->stats.nopenaction. obj and act are ignored. */
static void OpenAction_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.nopenaction += 1;
    }
}
/* Statistics callback: adds one to pdf->stats.nlaunch. obj and act are ignored. */
static void Launch_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.nlaunch += 1;
    }
}
/* Statistics callback: adds one to pdf->stats.npage. obj and act are ignored. */
static void Page_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.npage += 1;
    }
}
/*
 * Metadata callback: when metadata collection is enabled and no author entry
 * has been allocated yet, allocates pdf->stats.author and fills its data/meta
 * from the "/Author" string parsed out of obj.
 */
static void Author_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    /* Presumably read by SCAN_COLLECT_METADATA; the name must stay `ctx`. */
    cli_ctx *ctx = NULL;
    const char *base;

    UNUSEDPARAM(act);

    if (NULL == pdf) {
        return;
    }

    ctx = pdf->ctx;
    if (!(SCAN_COLLECT_METADATA)) {
        return;
    }

    /* An entry allocated by an earlier call is never replaced. */
    if (pdf->stats.author) {
        return;
    }

    /* The object's bytes live either in its object stream buffer or in the main map. */
    if (obj->objstm) {
        base = (const char *)(obj->start + obj->objstm->streambuf);
    } else {
        base = (const char *)(obj->start + pdf->map);
    }

    pdf->stats.author = calloc(1, sizeof(struct pdf_stats_entry));
    if (NULL == pdf->stats.author) {
        return;
    }

    pdf->parse_recursion_depth++;
    pdf->stats.author->data = pdf_parse_string(pdf, obj, base, obj->size, "/Author", NULL, &(pdf->stats.author->meta));
    pdf->parse_recursion_depth--;
}
/*
 * Metadata callback: when metadata collection is enabled and no creator entry
 * has been allocated yet, allocates pdf->stats.creator and fills its data/meta
 * from the "/Creator" string parsed out of obj.
 */
static void Creator_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    /* Presumably read by SCAN_COLLECT_METADATA; the name must stay `ctx`. */
    cli_ctx *ctx = NULL;
    const char *base;

    UNUSEDPARAM(act);

    if (NULL == pdf) {
        return;
    }

    ctx = pdf->ctx;
    if (!(SCAN_COLLECT_METADATA)) {
        return;
    }

    /* An entry allocated by an earlier call is never replaced. */
    if (pdf->stats.creator) {
        return;
    }

    /* The object's bytes live either in its object stream buffer or in the main map. */
    if (obj->objstm) {
        base = (const char *)(obj->start + obj->objstm->streambuf);
    } else {
        base = (const char *)(obj->start + pdf->map);
    }

    pdf->stats.creator = calloc(1, sizeof(struct pdf_stats_entry));
    if (NULL == pdf->stats.creator) {
        return;
    }

    pdf->parse_recursion_depth++;
    pdf->stats.creator->data = pdf_parse_string(pdf, obj, base, obj->size, "/Creator", NULL, &(pdf->stats.creator->meta));
    pdf->parse_recursion_depth--;
}
/*
 * Metadata callback: when metadata collection is enabled and no modification
 * date entry has been allocated yet, allocates pdf->stats.modificationdate and
 * fills its data/meta from the "/ModDate" string parsed out of obj.
 */
static void ModificationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    /* Presumably read by SCAN_COLLECT_METADATA; the name must stay `ctx`. */
    cli_ctx *ctx = NULL;
    const char *base;

    UNUSEDPARAM(act);

    if (NULL == pdf) {
        return;
    }

    ctx = pdf->ctx;
    if (!(SCAN_COLLECT_METADATA)) {
        return;
    }

    /* An entry allocated by an earlier call is never replaced. */
    if (pdf->stats.modificationdate) {
        return;
    }

    /* The object's bytes live either in its object stream buffer or in the main map. */
    if (obj->objstm) {
        base = (const char *)(obj->start + obj->objstm->streambuf);
    } else {
        base = (const char *)(obj->start + pdf->map);
    }

    pdf->stats.modificationdate = calloc(1, sizeof(struct pdf_stats_entry));
    if (NULL == pdf->stats.modificationdate) {
        return;
    }

    pdf->parse_recursion_depth++;
    pdf->stats.modificationdate->data = pdf_parse_string(pdf, obj, base, obj->size, "/ModDate", NULL, &(pdf->stats.modificationdate->meta));
    pdf->parse_recursion_depth--;
}
/*
 * Metadata callback: when metadata collection is enabled and no creation date
 * entry has been allocated yet, allocates pdf->stats.creationdate and fills its
 * data/meta from the "/CreationDate" string parsed out of obj.
 */
static void CreationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    /* Presumably read by SCAN_COLLECT_METADATA; the name must stay `ctx`. */
    cli_ctx *ctx = NULL;
    const char *base;

    UNUSEDPARAM(act);

    if (NULL == pdf) {
        return;
    }

    ctx = pdf->ctx;
    if (!(SCAN_COLLECT_METADATA)) {
        return;
    }

    /* An entry allocated by an earlier call is never replaced. */
    if (pdf->stats.creationdate) {
        return;
    }

    /* The object's bytes live either in its object stream buffer or in the main map. */
    if (obj->objstm) {
        base = (const char *)(obj->start + obj->objstm->streambuf);
    } else {
        base = (const char *)(obj->start + pdf->map);
    }

    pdf->stats.creationdate = calloc(1, sizeof(struct pdf_stats_entry));
    if (NULL == pdf->stats.creationdate) {
        return;
    }

    pdf->parse_recursion_depth++;
    pdf->stats.creationdate->data = pdf_parse_string(pdf, obj, base, obj->size, "/CreationDate", NULL, &(pdf->stats.creationdate->meta));
    pdf->parse_recursion_depth--;
}
/*
 * Metadata callback: when metadata collection is enabled and no producer entry
 * has been allocated yet, allocates pdf->stats.producer and fills its data/meta
 * from the "/Producer" string parsed out of obj.
 */
static void Producer_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    /* Presumably read by SCAN_COLLECT_METADATA; the name must stay `ctx`. */
    cli_ctx *ctx = NULL;
    const char *base;

    UNUSEDPARAM(act);

    if (NULL == pdf) {
        return;
    }

    ctx = pdf->ctx;
    if (!(SCAN_COLLECT_METADATA)) {
        return;
    }

    /* An entry allocated by an earlier call is never replaced. */
    if (pdf->stats.producer) {
        return;
    }

    /* The object's bytes live either in its object stream buffer or in the main map. */
    if (obj->objstm) {
        base = (const char *)(obj->start + obj->objstm->streambuf);
    } else {
        base = (const char *)(obj->start + pdf->map);
    }

    pdf->stats.producer = calloc(1, sizeof(struct pdf_stats_entry));
    if (NULL == pdf->stats.producer) {
        return;
    }

    pdf->parse_recursion_depth++;
    pdf->stats.producer->data = pdf_parse_string(pdf, obj, base, obj->size, "/Producer", NULL, &(pdf->stats.producer->meta));
    pdf->parse_recursion_depth--;
}
/*
 * Metadata callback: when metadata collection is enabled and no title entry
 * has been allocated yet, allocates pdf->stats.title and fills its data/meta
 * from the "/Title" string parsed out of obj.
 */
static void Title_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    /* Presumably read by SCAN_COLLECT_METADATA; the name must stay `ctx`. */
    cli_ctx *ctx = NULL;
    const char *base;

    UNUSEDPARAM(act);

    if (NULL == pdf) {
        return;
    }

    ctx = pdf->ctx;
    if (!(SCAN_COLLECT_METADATA)) {
        return;
    }

    /* An entry allocated by an earlier call is never replaced. */
    if (pdf->stats.title) {
        return;
    }

    /* The object's bytes live either in its object stream buffer or in the main map. */
    if (obj->objstm) {
        base = (const char *)(obj->start + obj->objstm->streambuf);
    } else {
        base = (const char *)(obj->start + pdf->map);
    }

    pdf->stats.title = calloc(1, sizeof(struct pdf_stats_entry));
    if (NULL == pdf->stats.title) {
        return;
    }

    pdf->parse_recursion_depth++;
    pdf->stats.title->data = pdf_parse_string(pdf, obj, base, obj->size, "/Title", NULL, &(pdf->stats.title->meta));
    pdf->parse_recursion_depth--;
}
/*
 * Metadata callback: when metadata collection is enabled and no keywords entry
 * has been allocated yet, allocates pdf->stats.keywords and fills its data/meta
 * from the "/Keywords" string parsed out of obj.
 */
static void Keywords_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    /* Presumably read by SCAN_COLLECT_METADATA; the name must stay `ctx`. */
    cli_ctx *ctx = NULL;
    const char *base;

    UNUSEDPARAM(act);

    if (NULL == pdf) {
        return;
    }

    ctx = pdf->ctx;
    if (!(SCAN_COLLECT_METADATA)) {
        return;
    }

    /* An entry allocated by an earlier call is never replaced. */
    if (pdf->stats.keywords) {
        return;
    }

    /* The object's bytes live either in its object stream buffer or in the main map. */
    if (obj->objstm) {
        base = (const char *)(obj->start + obj->objstm->streambuf);
    } else {
        base = (const char *)(obj->start + pdf->map);
    }

    pdf->stats.keywords = calloc(1, sizeof(struct pdf_stats_entry));
    if (NULL == pdf->stats.keywords) {
        return;
    }

    pdf->parse_recursion_depth++;
    pdf->stats.keywords->data = pdf_parse_string(pdf, obj, base, obj->size, "/Keywords", NULL, &(pdf->stats.keywords->meta));
    pdf->parse_recursion_depth--;
}
/*
 * Metadata callback: when metadata collection is enabled and no subject entry
 * has been allocated yet, allocates pdf->stats.subject and fills its data/meta
 * from the "/Subject" string parsed out of obj.
 */
static void Subject_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    /* Presumably read by SCAN_COLLECT_METADATA; the name must stay `ctx`. */
    cli_ctx *ctx = NULL;
    const char *base;

    UNUSEDPARAM(act);

    if (NULL == pdf) {
        return;
    }

    ctx = pdf->ctx;
    if (!(SCAN_COLLECT_METADATA)) {
        return;
    }

    /* An entry allocated by an earlier call is never replaced. */
    if (pdf->stats.subject) {
        return;
    }

    /* The object's bytes live either in its object stream buffer or in the main map. */
    if (obj->objstm) {
        base = (const char *)(obj->start + obj->objstm->streambuf);
    } else {
        base = (const char *)(obj->start + pdf->map);
    }

    pdf->stats.subject = calloc(1, sizeof(struct pdf_stats_entry));
    if (NULL == pdf->stats.subject) {
        return;
    }

    pdf->parse_recursion_depth++;
    pdf->stats.subject->data = pdf_parse_string(pdf, obj, base, obj->size, "/Subject", NULL, &(pdf->stats.subject->meta));
    pdf->parse_recursion_depth--;
}
/* Statistics callback: adds one to pdf->stats.nrichmedia. obj and act are ignored. */
static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.nrichmedia += 1;
    }
}
/* Statistics callback: adds one to pdf->stats.nacroform. obj and act are ignored. */
static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.nacroform += 1;
    }
}
/* Statistics callback: adds one to pdf->stats.nxfa. obj and act are ignored. */
static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.nxfa += 1;
    }
}
/*
 * Metadata callback for a pages node: counts the entries of the "/Kids" array
 * that contain an 'R' and compares that number with the integer following
 * "/Count". Sets "IncorrectPagesCount" under "PDFStats" when the array cannot
 * be parsed, "/Count" is missing, not a valid non-negative number, or differs
 * from the counted kids.
 *
 * Does nothing unless pdf, obj, pdf->ctx and the layer's metadata JSON are all
 * available and metadata collection is enabled.
 */
static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    /* Presumably read by SCAN_COLLECT_METADATA; the name must stay `ctx`. */
    cli_ctx *ctx = NULL;
    struct pdf_array *array;
    const char *objstart;
    const char *begin;
    unsigned long npages = 0, count;
    long temp_long;
    struct pdf_array_node *node;
    json_object *pdfobj;
    size_t countsize = 0;

    UNUSEDPARAM(act);

    /* Validate every pointer before it is dereferenced; objstart is computed
     * only after these checks because it reads both obj and pdf. */
    if (!(pdf) || !(obj) || !(pdf->ctx) || !(pdf->ctx->this_layer_metadata_json))
        return;

    ctx = pdf->ctx;

    if (!(SCAN_COLLECT_METADATA))
        return;

    /* The object's bytes live either in its object stream buffer or in the main map. */
    objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
                             : (const char *)(obj->start + pdf->map);

    pdfobj = cli_jsonobj(pdf->ctx->this_layer_metadata_json, "PDFStats");
    if (!(pdfobj))
        return;

    begin = cli_memstr(objstart, obj->size, "/Kids", 5);
    if (!(begin))
        return;

    begin += 5;

    pdf->parse_recursion_depth++;
    array = pdf_parse_array(pdf, obj, obj->size, (char *)begin, NULL);
    pdf->parse_recursion_depth--;
    if (!(array)) {
        cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
        return;
    }

    /* Each non-empty array entry containing an 'R' is counted as one page. */
    for (node = array->nodes; node != NULL; node = node->next)
        if (node->datasz)
            if (strchr((char *)(node->data), 'R'))
                npages++;

    begin = cli_memstr(objstart, obj->size, "/Count", 6);
    if (!(begin)) {
        cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
        goto cleanup;
    }

    begin += 6;
    /* Cast for isspace(): passing a negative plain char is undefined behavior. */
    while (((size_t)(begin - objstart) < obj->size) && isspace((unsigned char)begin[0]))
        begin++;

    if ((size_t)(begin - objstart) >= obj->size) {
        goto cleanup;
    }

    /* Bytes left in the object starting at begin. */
    countsize = obj->size - (size_t)(begin - objstart);

    if (CL_SUCCESS != cli_strntol_wrap(begin, countsize, 0, 10, &temp_long)) {
        cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
    } else if (temp_long < 0) {
        cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
    } else {
        count = (unsigned long)temp_long;
        if (count != npages) {
            cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
        }
    }

cleanup:
    pdf_free_array(array);
}
/*
 * Metadata callback: reads the integer following "/Colors" in obj and, when it
 * is at least 2**24, appends (obj->id >> 8) to the "BigColors" array under
 * "PDFStats".
 *
 * Does nothing unless pdf, obj, pdf->ctx and the layer's metadata JSON are all
 * available and metadata collection is enabled.
 */
static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    /* Presumably read by SCAN_COLLECT_METADATA; the name must stay `ctx`. */
    cli_ctx *ctx = NULL;
    json_object *colorsobj, *pdfobj;
    unsigned long ncolors;
    long temp_long;
    char *p1;
    const char *objstart;
    size_t remaining;

    UNUSEDPARAM(act);

    /* Validate every pointer before it is dereferenced; objstart is computed
     * only after these checks because it reads both obj and pdf. */
    if (!(pdf) || !(obj) || !(pdf->ctx) || !(pdf->ctx->this_layer_metadata_json))
        return;

    ctx = pdf->ctx;

    if (!(SCAN_COLLECT_METADATA))
        return;

    /* The object's bytes live either in its object stream buffer or in the main map. */
    objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
                             : (const char *)(obj->start + pdf->map);

    p1 = (char *)cli_memstr(objstart, obj->size, "/Colors", 7);
    if (!(p1))
        return;

    p1 += 7;

    /* Ensure that we have at least one whitespace character plus at least one number */
    if (obj->size - (size_t)(p1 - objstart) < 2)
        return;

    /* Cast for isspace(): passing a negative plain char is undefined behavior. */
    while (((size_t)(p1 - objstart) < obj->size) && isspace((unsigned char)p1[0]))
        p1++;

    if ((size_t)(p1 - objstart) == obj->size)
        return;

    /* Bytes left in the object starting at p1. The operands must be in this
     * order: the reverse subtraction is negative and turns into a huge size_t,
     * which would let the number parser read past the end of the object. */
    remaining = obj->size - (size_t)(p1 - objstart);

    if (CL_SUCCESS != cli_strntol_wrap(p1, remaining, 0, 10, &temp_long)) {
        return;
    } else if (temp_long < 0) {
        return;
    }
    ncolors = (unsigned long)temp_long;

    /* Only color counts of at least 2**24 are reported. */
    if (ncolors < (1UL << 24))
        return;

    pdfobj = cli_jsonobj(pdf->ctx->this_layer_metadata_json, "PDFStats");
    if (!(pdfobj))
        return;

    colorsobj = cli_jsonarray(pdfobj, "BigColors");
    if (!(colorsobj))
        return;

    cli_jsonint_array(colorsobj, obj->id >> 8);
}
/*
 * Metadata callback: copies the text between the first '(' and the following
 * ')' inside obj and appends it to the "URIs" array of the layer's metadata
 * JSON.
 *
 * Does nothing unless pdf, obj, pdf->ctx and the metadata JSON are available
 * and both metadata collection and PDF URI storing are enabled. Both
 * delimiters must lie inside the object's bounded region; otherwise nothing
 * is recorded.
 */
static void URI_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    /* Presumably read by the SCAN_* macros; the name must stay `ctx`. */
    cli_ctx *ctx         = NULL;
    off_t bytesleft      = 0;
    char *uri_start      = NULL;
    char *uri_heap       = NULL;
    const char *objstart = NULL;
    json_object *uriarr  = NULL;
    size_t start         = 0;
    size_t end           = 0;

    UNUSEDPARAM(act);

    if (!(pdf) || !(pdf->ctx) || !(pdf->ctx->this_layer_metadata_json) || !obj) {
        return;
    }

    /* The object's bytes live either in its object stream buffer or in the main map. */
    objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
                             : (const char *)(obj->start + pdf->map);

    ctx = pdf->ctx;

    if (!(SCAN_COLLECT_METADATA) || !(SCAN_STORE_PDF_URIS)) {
        return;
    }

    if (obj->size == 0) {
        return;
    }

    /* Never look further than the containing buffer allows. */
    if (obj->objstm) {
        bytesleft = MIN(obj->size, obj->objstm->streambuf_len - obj->start);
    } else {
        bytesleft = MIN(obj->size, pdf->size - obj->start);
    }

    // Advance forward to the first '(' character
    while (bytesleft > 0 && objstart[start] != '(') {
        start++;
        bytesleft--;
    }

    // No '(' inside the region (also rejects a non-positive starting length)
    if (bytesleft <= 0) {
        return;
    }

    // The first character past '(' is the start of the URI
    uri_start = (char *)(objstart + start + 1);
    bytesleft--;

    // Advance forward to the first ')' character
    while (bytesleft > 0 && uri_start[end] != ')') {
        end++;
        bytesleft--;
    }

    // The loop above stops with bytes still left only when it found ')'.
    // Test the counter rather than uri_start[end]: once the region is
    // exhausted, uri_start[end] is one byte past it and must not be read.
    if (bytesleft <= 0) {
        return;
    }

    // Create a new string containing only the URI
    CLI_MAX_MALLOC_OR_GOTO_DONE(uri_heap, end + 1,
                                cli_errmsg("cli_pdf: malloc() failed (URI)\n"));
    strncpy(uri_heap, uri_start, end);
    uri_heap[end] = '\0';

    uriarr = cli_jsonarray(pdf->ctx->this_layer_metadata_json, "URIs");
    if (!uriarr) {
        cli_errmsg("cli_pdf: malloc() failed (URI array)\n");
        goto done;
    }

    cli_jsonstr(uriarr, NULL, uri_heap);

done:
    free(uri_heap);
}
/*
 * Release every metadata entry stored in pdf->stats (author, creator,
 * producer, modification date, creation date, title, subject, keywords):
 * free the entry's data buffer, free the entry itself and reset the pointer
 * to NULL. Safe to call with a NULL pdf.
 */
static void pdf_free_stats(struct pdf_struct *pdf)
{
/* Free one stats entry and its data, then clear the owning pointer. */
#define PDF_FREE_STATS_ENTRY_(entry) \
    do {                             \
        if (NULL != (entry)) {       \
            free((entry)->data);     \
            free(entry);             \
            (entry) = NULL;          \
        }                            \
    } while (0)

    if (NULL == pdf) {
        return;
    }

    PDF_FREE_STATS_ENTRY_(pdf->stats.author);
    PDF_FREE_STATS_ENTRY_(pdf->stats.creator);
    PDF_FREE_STATS_ENTRY_(pdf->stats.producer);
    PDF_FREE_STATS_ENTRY_(pdf->stats.modificationdate);
    PDF_FREE_STATS_ENTRY_(pdf->stats.creationdate);
    PDF_FREE_STATS_ENTRY_(pdf->stats.title);
    PDF_FREE_STATS_ENTRY_(pdf->stats.subject);
    PDF_FREE_STATS_ENTRY_(pdf->stats.keywords);

#undef PDF_FREE_STATS_ENTRY_
}
static void pdf_export_json(struct pdf_struct *pdf)
{
cli_ctx *ctx = NULL;
json_object *pdfobj;
unsigned long i;
if (NULL == pdf)
return;
if (!(pdf->ctx)) {
goto cleanup;
}
ctx = pdf->ctx;
if (!(SCAN_COLLECT_METADATA) || !(pdf->ctx->this_layer_metadata_json)) {
goto cleanup;
}
pdfobj = cli_jsonobj(pdf->ctx->this_layer_metadata_json, "PDFStats");
if (!(pdfobj)) {
goto cleanup;
}
if (pdf->stats.author) {
if (!pdf->stats.author->meta.success) {
char *out = pdf_finalize_string(pdf, pdf->stats.author->meta.obj, pdf->stats.author->data, pdf->stats.author->meta.length);
if (out) {
free(pdf->stats.author->data);
pdf->stats.author->data = out;
pdf->stats.author->meta.length = strlen(out);
pdf->stats.author->meta.success = 1;
}
}
if (pdf->stats.author->meta.success && cli_isutf8(pdf->stats.author->data, pdf->stats.author->meta.length)) {
cli_jsonstr(pdfobj, "Author", pdf->stats.author->data);
} else if (pdf->stats.author->data && pdf->stats.author->meta.length) {
char *b64 = cl_base64_encode(pdf->stats.author->data, pdf->stats.author->meta.length);
cli_jsonstr(pdfobj, "Author", b64);
cli_jsonbool(pdfobj, "Author_base64", 1);
free(b64);
} else {
cli_jsonstr(pdfobj, "Author", "");
}
}
if (pdf->stats.creator) {
if (!pdf->stats.creator->meta.success) {
char *out = pdf_finalize_string(pdf, pdf->stats.creator->meta.obj, pdf->stats.creator->data, pdf->stats.creator->meta.length);
if (out) {
free(pdf->stats.creator->data);
pdf->stats.creator->data = out;
pdf->stats.creator->meta.length = strlen(out);
pdf->stats.creator->meta.success = 1;
}
}
if (pdf->stats.creator->meta.success && cli_isutf8(pdf->stats.creator->data, pdf->stats.creator->meta.length)) {
cli_jsonstr(pdfobj, "Creator", pdf->stats.creator->data);
} else if (pdf->stats.creator->data && pdf->stats.creator->meta.length) {
char *b64 = cl_base64_encode(pdf->stats.creator->data, pdf->stats.creator->meta.length);
cli_jsonstr(pdfobj, "Creator", b64);
cli_jsonbool(pdfobj, "Creator_base64", 1);
free(b64);
} else {
cli_jsonstr(pdfobj, "Creator", "");
}
}
if (pdf->stats.producer) {
if (!pdf->stats.producer->meta.success) {
char *out = pdf_finalize_string(pdf, pdf->stats.producer->meta.obj, pdf->stats.producer->data, pdf->stats.producer->meta.length);
if (out) {
free(pdf->stats.producer->data);
pdf->stats.producer->data = out;
pdf->stats.producer->meta.length = strlen(out);
pdf->stats.producer->meta.success = 1;
}
}
if (pdf->stats.producer->meta.success && cli_isutf8(pdf->stats.producer->data, pdf->stats.producer->meta.length)) {
cli_jsonstr(pdfobj, "Producer", pdf->stats.producer->data);
} else if (pdf->stats.producer->data && pdf->stats.producer->meta.length) {
char *b64 = cl_base64_encode(pdf->stats.producer->data, pdf->stats.producer->meta.length);
cli_jsonstr(pdfobj, "Producer", b64);
cli_jsonbool(pdfobj, "Producer_base64", 1);
free(b64);
} else {
cli_jsonstr(pdfobj, "Producer", "");
}
}
if (pdf->stats.modificationdate) {
if (!pdf->stats.modificationdate->meta.success) {
char *out = pdf_finalize_string(pdf, pdf->stats.modificationdate->meta.obj, pdf->stats.modificationdate->data, pdf->stats.modificationdate->meta.length);
if (out) {
free(pdf->stats.modificationdate->data);
pdf->stats.modificationdate->data = out;
pdf->stats.modificationdate->meta.length = strlen(out);
pdf->stats.modificationdate->meta.success = 1;
}
}
if (pdf->stats.modificationdate->meta.success && cli_isutf8(pdf->stats.modificationdate->data, pdf->stats.modificationdate->meta.length)) {
cli_jsonstr(pdfobj, "ModificationDate", pdf->stats.modificationdate->data);
} else if (pdf->stats.modificationdate->data && pdf->stats.modificationdate->meta.length) {
char *b64 = cl_base64_encode(pdf->stats.modificationdate->data, pdf->stats.modificationdate->meta.length);
cli_jsonstr(pdfobj, "ModificationDate", b64);
cli_jsonbool(pdfobj, "ModificationDate_base64", 1);
free(b64);
} else {
cli_jsonstr(pdfobj, "ModificationDate", "");
}
}
if (pdf->stats.creationdate) {
if (!pdf->stats.creationdate->meta.success) {
char *out = pdf_finalize_string(pdf, pdf->stats.creationdate->meta.obj, pdf->stats.creationdate->data, pdf->stats.creationdate->meta.length);
if (out) {
free(pdf->stats.creationdate->data);
pdf->stats.creationdate->data = out;
pdf->stats.creationdate->meta.length = strlen(out);
pdf->stats.creationdate->meta.success = 1;
}
}
if (pdf->stats.creationdate->meta.success && cli_isutf8(pdf->stats.creationdate->data, pdf->stats.creationdate->meta.length)) {
cli_jsonstr(pdfobj, "CreationDate", pdf->stats.creationdate->data);
} else if (pdf->stats.creationdate->data && pdf->stats.creationdate->meta.length) {
char *b64 = cl_base64_encode(pdf->stats.creationdate->data, pdf->stats.creationdate->meta.length);
cli_jsonstr(pdfobj, "CreationDate", b64);
cli_jsonbool(pdfobj, "CreationDate_base64", 1);
free(b64);
} else {
cli_jsonstr(pdfobj, "CreationDate", "");
}
}
if (pdf->stats.title) {
if (!pdf->stats.title->meta.success) {
char *out = pdf_finalize_string(pdf, pdf->stats.title->meta.obj, pdf->stats.title->data, pdf->stats.title->meta.length);
if (out) {
free(pdf->stats.title->data);
pdf->stats.title->data = out;
pdf->stats.title->meta.length = strlen(out);
pdf->stats.title->meta.success = 1;
}
}
if (pdf->stats.title->meta.success && cli_isutf8(pdf->stats.title->data, pdf->stats.title->meta.length)) {
cli_jsonstr(pdfobj, "Title", pdf->stats.title->data);
} else if (pdf->stats.title->data && pdf->stats.title->meta.length) {
char *b64 = cl_base64_encode(pdf->stats.title->data, pdf->stats.title->meta.length);
cli_jsonstr(pdfobj, "Title", b64);
cli_jsonbool(pdfobj, "Title_base64", 1);
free(b64);
} else {
cli_jsonstr(pdfobj, "Title", "");
}
}
if (pdf->stats.subject) {
if (!pdf->stats.subject->meta.success) {
char *out = pdf_finalize_string(pdf, pdf->stats.subject->meta.obj, pdf->stats.subject->data, pdf->stats.subject->meta.length);
if (out) {
free(pdf->stats.subject->data);
pdf->stats.subject->data = out;
pdf->stats.subject->meta.length = strlen(out);
pdf->stats.subject->meta.success = 1;
}
}
if (pdf->stats.subject->meta.success && cli_isutf8(pdf->stats.subject->data, pdf->stats.subject->meta.length)) {
cli_jsonstr(pdfobj, "Subject", pdf->stats.subject->data);
} else if (pdf->stats.subject->data && pdf->stats.subject->meta.length) {
char *b64 = cl_base64_encode(pdf->stats.subject->data, pdf->stats.subject->meta.length);
cli_jsonstr(pdfobj, "Subject", b64);
cli_jsonbool(pdfobj, "Subject_base64", 1);
free(b64);
} else {
cli_jsonstr(pdfobj, "Subject", "");
}
}
if (pdf->stats.keywords) {
if (!pdf->stats.keywords->meta.success) {
char *out = pdf_finalize_string(pdf, pdf->stats.keywords->meta.obj, pdf->stats.keywords->data, pdf->stats.keywords->meta.length);
if (out) {
free(pdf->stats.keywords->data);
pdf->stats.keywords->data = out;
pdf->stats.keywords->meta.length = strlen(out);
pdf->stats.keywords->meta.success = 1;
}
}
if (pdf->stats.keywords->meta.success && cli_isutf8(pdf->stats.keywords->data, pdf->stats.keywords->meta.length)) {
cli_jsonstr(pdfobj, "Keywords", pdf->stats.keywords->data);
} else if (pdf->stats.keywords->data && pdf->stats.keywords->meta.length) {
char *b64 = cl_base64_encode(pdf->stats.keywords->data, pdf->stats.keywords->meta.length);
cli_jsonstr(pdfobj, "Keywords", b64);
cli_jsonbool(pdfobj, "Keywords_base64", 1);
free(b64);
} else {
cli_jsonstr(pdfobj, "Keywords", "");
}
}
if (pdf->stats.ninvalidobjs)
cli_jsonint(pdfobj, "InvalidObjectCount", pdf->stats.ninvalidobjs);
if (pdf->stats.njs)
cli_jsonint(pdfobj, "JavaScriptObjectCount", pdf->stats.njs);
if (pdf->stats.nflate)
cli_jsonint(pdfobj, "DeflateObjectCount", pdf->stats.nflate);
if (pdf->stats.nactivex)
cli_jsonint(pdfobj, "ActiveXObjectCount", pdf->stats.nactivex);
if (pdf->stats.nflash)
cli_jsonint(pdfobj, "FlashObjectCount", pdf->stats.nflash);
if (pdf->stats.ncolors)
cli_jsonint(pdfobj, "ColorCount", pdf->stats.ncolors);
if (pdf->stats.nasciihexdecode)
cli_jsonint(pdfobj, "AsciiHexDecodeObjectCount", pdf->stats.nasciihexdecode);
if (pdf->stats.nascii85decode)
cli_jsonint(pdfobj, "Ascii85DecodeObjectCount", pdf->stats.nascii85decode);
if (pdf->stats.nembeddedfile)
cli_jsonint(pdfobj, "EmbeddedFileCount", pdf->stats.nembeddedfile);
if (pdf->stats.nimage)
cli_jsonint(pdfobj, "ImageCount", pdf->stats.nimage);
if (pdf->stats.nlzw)
cli_jsonint(pdfobj, "LZWCount", pdf->stats.nlzw);
if (pdf->stats.nrunlengthdecode)
cli_jsonint(pdfobj, "RunLengthDecodeCount", pdf->stats.nrunlengthdecode);
if (pdf->stats.nfaxdecode)
cli_jsonint(pdfobj, "FaxDecodeCount", pdf->stats.nfaxdecode);
if (pdf->stats.njbig2decode)
cli_jsonint(pdfobj, "JBIG2DecodeCount", pdf->stats.njbig2decode);
if (pdf->stats.ndctdecode)
cli_jsonint(pdfobj, "DCTDecodeCount", pdf->stats.ndctdecode);
if (pdf->stats.njpxdecode)
cli_jsonint(pdfobj, "JPXDecodeCount", pdf->stats.njpxdecode);
if (pdf->stats.ncrypt)
cli_jsonint(pdfobj, "CryptCount", pdf->stats.ncrypt);
if (pdf->stats.nstandard)
cli_jsonint(pdfobj, "StandardCount", pdf->stats.nstandard);
if (pdf->stats.nsigned)
cli_jsonint(pdfobj, "SignedCount", pdf->stats.nsigned);
if (pdf->stats.nopenaction)
cli_jsonint(pdfobj, "OpenActionCount", pdf->stats.nopenaction);
if (pdf->stats.nlaunch)
cli_jsonint(pdfobj, "LaunchCount", pdf->stats.nlaunch);
if (pdf->stats.npage)
cli_jsonint(pdfobj, "PageCount", pdf->stats.npage);
if (pdf->stats.nrichmedia)
cli_jsonint(pdfobj, "RichMediaCount", pdf->stats.nrichmedia);
if (pdf->stats.nacroform)
cli_jsonint(pdfobj, "AcroFormCount", pdf->stats.nacroform);
if (pdf->stats.nxfa)
cli_jsonint(pdfobj, "XFACount", pdf->stats.nxfa);
if (pdf->flags & (1 << BAD_PDF_VERSION))
cli_jsonbool(pdfobj, "BadVersion", 1);
if (pdf->flags & (1 << BAD_PDF_HEADERPOS))
cli_jsonbool(pdfobj, "BadHeaderPosition", 1);
if (pdf->flags & (1 << BAD_PDF_TRAILER))
cli_jsonbool(pdfobj, "BadTrailer", 1);
if (pdf->flags & (1 << BAD_PDF_TOOMANYOBJS))
cli_jsonbool(pdfobj, "TooManyObjects", 1);
if (pdf->flags & (1 << ENCRYPTED_PDF)) {
cli_jsonbool(pdfobj, "Encrypted", 1);
if (pdf->flags & (1 << DECRYPTABLE_PDF))
cli_jsonbool(pdfobj, "Decryptable", 1);
else
cli_jsonbool(pdfobj, "Decryptable", 0);
}
for (i = 0; i < pdf->nobjs; i++) {
if (pdf->objs[i]->flags & (1 << OBJ_TRUNCATED)) {
json_object *truncobj;
truncobj = cli_jsonarray(pdfobj, "TruncatedObjects");
if (!(truncobj))
continue;
cli_jsonint_array(truncobj, pdf->objs[i]->id >> 8);
}
}
cleanup:
pdf_free_stats(pdf);
}