clamav/libclamav/pdf.c

1196 lines
30 KiB
C
Raw Normal View History

2005-04-27 12:45:17 +00:00
/*
* Copyright (C) 2007-2008 Sourcefire, Inc.
*
* Authors: Nigel Horne
2005-04-27 12:45:17 +00:00
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
2005-04-27 12:45:17 +00:00
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*
* TODO: Embedded fonts
* TODO: Predictor image handling
2005-04-27 12:45:17 +00:00
*/
2007-02-12 20:47:28 +00:00
static char const rcsid[] = "$Id: pdf.c,v 1.61 2007/02/12 20:46:09 njh Exp $";
2005-04-27 12:45:17 +00:00
#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <ctype.h>
#include <string.h>
#include <fcntl.h>
#include <stdlib.h>
2005-05-01 11:47:49 +00:00
#include <errno.h>
#ifdef HAVE_LIMITS_H
#include <limits.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <zlib.h>
#include "clamav.h"
#include "others.h"
2005-12-09 07:39:17 +00:00
#include "pdf.h"
#include "scanners.h"
2009-08-25 01:21:15 +02:00
#include "fmap.h"
#include "str.h"
#ifdef CL_DEBUG
/*#define SAVE_TMP
*Save the file being worked on in tmp */
#endif
static int try_flatedecode(unsigned char *buf, off_t real_len, off_t calculated_len, int fout, cli_ctx *ctx);
static int flatedecode(unsigned char *buf, off_t len, int fout, cli_ctx *ctx);
2007-02-10 14:23:52 +00:00
static int ascii85decode(const char *buf, off_t len, unsigned char *output);
static const char *pdf_nextlinestart(const char *ptr, size_t len);
2005-07-30 10:08:59 +00:00
static const char *pdf_nextobject(const char *ptr, size_t len);
2010-05-10 11:57:44 +03:00
enum pdf_flag {
BAD_PDF_VERSION=0,
BAD_PDF_HEADERPOS,
BAD_PDF_TRAILER,
2010-05-10 23:41:34 +03:00
BAD_PDF_TOOMANYOBJS,
BAD_STREAM_FILTERS,
UNTERMINATED_OBJ_DICT,
ESCAPED_COMMON_PDFNAME,
2010-05-10 11:57:44 +03:00
};
static int xrefCheck(const char *xref, const char *eof)
{
const char *q;
while (xref < eof && *xref == ' ' || *xref == '\n' || *xref == '\r')
xref++;
if (xref + 4 >= eof)
return -1;
if (!memcmp(xref, "xref", 4)) {
cli_dbgmsg("cli_pdf: found xref\n");
return 0;
}
/* could be xref stream */
for (q=xref; q+5 < eof; q++) {
if (!memcmp(q,"/XRef",4)) {
cli_dbgmsg("cli_pdf: found /XRef\n");
return 0;
}
}
return -1;
}
enum objflags {
2010-05-10 23:41:34 +03:00
OBJ_STREAM=0,
OBJ_DICT,
OBJ_FILTER_AH,
OBJ_FILTER_A85,
OBJ_FILTER_FLATE,
OBJ_FILTER_LZW,
OBJ_FILTER_RL,
OBJ_FILTER_FAX,
OBJ_FILTER_JBIG2,
OBJ_FILTER_DCT,
OBJ_FILTER_JPX,
OBJ_FILTER_CRYPT,
OBJ_JAVASCRIPT
2010-05-10 11:57:44 +03:00
};
struct pdf_obj {
uint32_t start;
uint32_t id;
uint32_t flags;
};
struct pdf_struct {
struct pdf_obj *objs;
unsigned nobjs;
const char *map;
off_t size;
off_t offset;
2010-05-10 23:41:34 +03:00
unsigned flags;
2010-05-10 11:57:44 +03:00
};
static const char *findNextNonWSBack(const char *q, const char *start)
{
while (q > start &&
(*q == 0 || *q == 9 || *q == 0xa || *q == 0xc || *q == 0xd || *q == 0x20))
{
q--;
}
return q;
}
2010-05-10 23:41:34 +03:00
static int pdf_findobj(struct pdf_struct *pdf)
2010-05-10 11:57:44 +03:00
{
const char *start, *q, *q2, *eof;
struct pdf_obj *obj;
off_t bytesleft;
unsigned genid, objid;
pdf->nobjs++;
pdf->objs = cli_realloc2(pdf->objs, sizeof(*pdf->objs)*pdf->nobjs);
if (!pdf->objs) {
cli_warnmsg("cli_pdf: out of memory parsing objects (%ld)\n", pdf->nobjs);
return -1;
}
obj = &pdf->objs[pdf->nobjs-1];
start = pdf->map+pdf->offset;
bytesleft = pdf->size - pdf->offset;
q2 = cli_memstr(start, bytesleft, " obj", 4);
if (!q2)
return 0;/* no more objs */
bytesleft -= q2 - start;
q = findNextNonWSBack(q2-1, start);
while (q > start && isdigit(*q)) { q--; }
genid = atoi(q);
q = findNextNonWSBack(q-1,start);
while (q > start && isdigit(*q)) { q--; }
objid = atoi(q);
obj->id = (objid << 8) | (genid&0xff);
obj->start = q2+4 - pdf->map;
obj->flags = 0;
bytesleft -= 4;
eof = pdf->map + pdf->size;
q = pdf->map + obj->start;
while (q < eof && bytesleft > 0) {
q2 = pdf_nextobject(q, bytesleft);
if (!q2)
return 0;/* no more objs */
bytesleft -= q2 - q;
2010-05-10 23:41:34 +03:00
if ((q2 = cli_memstr(q-1, q2-q+1, "stream", 6))) {
2010-05-10 11:57:44 +03:00
obj->flags |= 1 << OBJ_STREAM;
q2 += 6;
bytesleft -= 6;
q2 = cli_memstr(q2, bytesleft, "endstream", 9);
if (!q2)
return 0;/* no more objs */
q2 += 6;
bytesleft -= q2 - q;
2010-05-10 23:41:34 +03:00
} else if ((q2 = cli_memstr(q-1, q2-q+1, "endobj", 6))) {
2010-05-10 11:57:44 +03:00
q2 += 6;
pdf->offset = q2 - pdf->map;
return 1; /* obj found and offset positioned */
} else {
2010-05-10 23:41:34 +03:00
q2++;
2010-05-10 11:57:44 +03:00
}
q = q2;
}
return 0;/* no more objs */
}
2010-05-10 23:41:34 +03:00
static void pdfobj_flag(struct pdf_struct *pdf, struct pdf_obj *obj, enum pdf_flag flag)
{
const char *s;
pdf->flags |= 1 << flag;
if (!cli_debug_flag)
return;
switch (flag) {
case UNTERMINATED_OBJ_DICT:
s = "dictionary not terminated";
break;
case ESCAPED_COMMON_PDFNAME:
/* like /JavaScript */
s = "escaped common pdfname";
break;
case BAD_STREAM_FILTERS:
s = "duplicate stream filters";
break;
case BAD_PDF_VERSION:
case BAD_PDF_HEADERPOS:
case BAD_PDF_TRAILER:
case BAD_PDF_TOOMANYOBJS:
return;
}
cli_dbgmsg("cli_pdf: %s in object %u %u\n", s, obj->id>>8, obj->id&0xff);
}
enum objstate {
STATE_NONE,
STATE_S,
STATE_FILTER,
STATE_ANY /* for actions table below */
};
struct pdfname_action {
const char *pdfname;
enum objflags set_objflag;/* OBJ_DICT is noop */
enum objstate from_state;/* STATE_NONE is noop */
enum objstate to_state;
};
static struct pdfname_action pdfname_actions[] = {
{"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER},
{"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER},
{"FlateDecode", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER},
{"LZWDecode", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER},
{"RunLengthDecode", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER},
{"CCITTFaxDecode", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER},
{"JBIG2Decode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER},
{"DCTDecode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER},
{"JPXDecode", OBJ_FILTER_JPX, STATE_FILTER, STATE_FILTER},
{"Crypt", OBJ_FILTER_CRYPT, STATE_FILTER, STATE_NONE},
{"Filter", OBJ_DICT, STATE_ANY, STATE_FILTER},
{"JavaScript", OBJ_JAVASCRIPT, STATE_S, STATE_NONE},
{"Length", OBJ_DICT, STATE_FILTER, STATE_NONE},
{"S", OBJ_DICT, STATE_NONE, STATE_S},
{"Type", OBJ_DICT, STATE_NONE, STATE_NONE}
};
static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj,
const char *pdfname, int escapes,
const char *after, enum objstate *state)
{
struct pdfname_action *act = NULL;
unsigned j;
for (j=0;j<sizeof(pdfname_actions)/sizeof(pdfname_actions[0]);j++) {
if (!strncmp(pdfname, pdfname_actions[j].pdfname, strlen(pdfname_actions[j].pdfname))) {
act = &pdfname_actions[j];
break;
}
}
if (!act)
return;
if (escapes) {
/* if a commonly used PDF name is escaped that is certainly
suspicious. */
cli_dbgmsg("cli_pdf: pdfname %s is escaped\n", pdfname);
pdfobj_flag(pdf, obj, ESCAPED_COMMON_PDFNAME);
}
if (act->from_state == *state ||
act->from_state == STATE_ANY) {
*state = act->to_state;
if (*state == STATE_FILTER &&
act->set_objflag !=OBJ_DICT &&
(obj->flags & (1 << act->set_objflag))) {
pdfobj_flag(pdf, obj, BAD_STREAM_FILTERS);
}
obj->flags |= 1 << act->set_objflag;
} else {
//auto-reset states
switch (*state) {
case STATE_S:
*state = STATE_NONE;
break;
}
}
}
static void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
{
/* enough to hold common pdf names, we don't need all the names */
char pdfname[64];
const char *q2, *q3;
const char *q = obj->start + pdf->map;
const char *dict, *start;
off_t dict_length;
off_t bytesleft = pdf->offset - obj->start;
unsigned i;
enum objstate objstate = STATE_NONE;
if (bytesleft < 0)
return;
start = q;
/* find start of dictionary */
do {
q2 = pdf_nextobject(q, bytesleft);
bytesleft -= q2 -q;
if (!q2 || bytesleft < 0) {
return;
}
q3 = memchr(q-1, '<', q2-q+1);
q2++;
bytesleft--;
q = q2;
} while (!q3 || q3[1] != '<');
dict = q3+2;
q = dict;
bytesleft = pdf->offset - obj->start - (q3 - start);
/* find end of dictionary */
do {
q2 = pdf_nextobject(q, bytesleft);
bytesleft -= q2 -q;
if (!q2 || bytesleft < 0) {
return;
}
q3 = memchr(q-1, '>', q2-q+1);
q2++;
bytesleft--;
q = q2;
} while (!q3 || q3[1] != '>');
obj->flags |= 1 << OBJ_DICT;
dict_length = q3 - dict;
// process pdf names
for (q = dict;dict_length;) {
int escapes = 0;
q2 = memchr(q, '/', dict_length);
if (!q2)
break;
dict_length -= q2 - q;
// normalize PDF names
for (i = 0;dict_length && (i < sizeof(pdfname)-1); i++) {
q++;
dict_length--;
if (*q == '#') {
cli_hex2str_to(q+1, pdfname+i, 2);
q += 2;
dict_length -= 2;
escapes = 1;
continue;
}
if (*q == ' ' || *q == '\r' || *q == '\n')
break;
pdfname[i] = *q;
}
pdfname[i] = '\0';
handle_pdfname(pdf, obj, pdfname, escapes, q, &objstate);
}
cli_dbgmsg("cli_pdf: %u %u obj flags: %02x\n", obj->id>>8, obj->id&0xff, obj->flags);
}
2010-05-10 11:57:44 +03:00
int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
{
struct pdf_struct pdf;
fmap_t *map = *ctx->fmap;
size_t size = map->len - offset;
off_t versize = size > 1032 ? 1032 : size;
off_t map_off, bytesleft;
long xref;
const char *pdfver, *start, *eofmap, *q, *eof;
int rc;
cli_dbgmsg("in cli_pdf(%s)\n", dir);
memset(&pdf, 0, sizeof(pdf));
pdfver = start = fmap_need_off_once(map, offset, versize);
/* Check PDF version */
if (!pdfver) {
cli_errmsg("cli_pdf: mmap() failed\n");
return CL_EMAP;
}
/* offset is 0 when coming from filetype2 */
pdfver = cli_memstr(pdfver, versize, "%PDF-", 5);
if (!pdfver) {
cli_dbgmsg("cli_pdf: no PDF- header found\n");
return CL_SUCCESS;
}
/* Check for PDF-1.[0-9]. Although 1.7 is highest now, allow for future
* versions */
if (pdfver[5] != '1' || pdfver[6] != '.' ||
pdfver[7] < '1' || pdfver[7] > '9') {
2010-05-10 23:41:34 +03:00
pdf.flags |= 1 << BAD_PDF_VERSION;
2010-05-10 11:57:44 +03:00
cli_dbgmsg("cli_pdf: bad pdf version: %.8s\n", pdfver);
}
if (pdfver != start || offset) {
2010-05-10 23:41:34 +03:00
pdf.flags |= 1 << BAD_PDF_HEADERPOS;
2010-05-10 11:57:44 +03:00
cli_dbgmsg("cli_pdf: PDF header is not at position 0: %d\n",pdfver-start+offset);
}
offset += pdfver - start;
/* find trailer and xref, don't fail if not found */
map_off = map->len - 2048;
if (map_off < 0)
map_off = 0;
bytesleft = map->len - map_off;
eofmap = fmap_need_off_once(map, map_off, bytesleft);
if (!eofmap) {
cli_errmsg("cli_pdf: mmap() failed\n");
return CL_EMAP;
}
eof = eofmap + bytesleft;
for (q=&eofmap[bytesleft-5]; q > eofmap; q--) {
if (memcmp(q, "%%EOF", 5) == 0)
break;
}
if (q <= eofmap) {
2010-05-10 23:41:34 +03:00
pdf.flags |= 1 << BAD_PDF_TRAILER;
2010-05-10 11:57:44 +03:00
cli_dbgmsg("cli_pdf: %%%%EOF not found\n");
} else {
size = q - eofmap + map_off;
for (;q > eofmap;q--) {
if (memcmp(q, "startxref", 9) == 0)
break;
}
if (q <= eofmap) {
2010-05-10 23:41:34 +03:00
pdf.flags |= 1 << BAD_PDF_TRAILER;
cli_dbgmsg("cli_pdf: startxref not found\n");
2010-05-10 11:57:44 +03:00
}
q += 9;
while (q < eof && (*q == ' ' || *q == '\n' || *q == '\r')) { q++; }
xref = atol(q);
bytesleft = map->len - offset - xref;
if (bytesleft > 4096)
bytesleft = 4096;
q = fmap_need_off_once(map, offset + xref, bytesleft);
if (!q || xrefCheck(q, q+bytesleft) == -1) {
cli_dbgmsg("cli_pdf: did not find valid xref\n");
2010-05-10 23:41:34 +03:00
pdf.flags |= 1 << BAD_PDF_TRAILER;
2010-05-10 11:57:44 +03:00
}
}
pdf.size = size;
pdf.map = fmap_need_off_once(map, offset, size);
if (!pdf.map) {
cli_errmsg("cli_pdf: mmap() failed\n");
return CL_EMAP;
}
2010-05-10 23:41:34 +03:00
while ((rc = pdf_findobj(&pdf)) > 0) {
2010-05-10 11:57:44 +03:00
struct pdf_obj *obj = &pdf.objs[pdf.nobjs-1];
cli_dbgmsg("found %d %d obj @%ld\n", obj->id >> 8, obj->id&0xff, obj->start + offset);
2010-05-10 23:41:34 +03:00
pdf_parseobj(&pdf, obj);
2010-05-10 11:57:44 +03:00
}
if (rc == -1)
2010-05-10 23:41:34 +03:00
pdf.flags |= 1 << BAD_PDF_TOOMANYOBJS;
2010-05-10 11:57:44 +03:00
2010-05-10 23:41:34 +03:00
if (pdf.flags)
cli_dbgmsg("cli_pdf: flags 0x%02x\n", pdf.flags);
2010-05-10 11:57:44 +03:00
return CL_SUCCESS;
}
2005-04-27 12:45:17 +00:00
int
2010-05-10 11:57:44 +03:00
cli_pdfold(const char *dir, cli_ctx *ctx, off_t offset)
2005-04-27 12:45:17 +00:00
{
2005-05-25 13:37:33 +00:00
off_t size; /* total number of bytes in the file */
2007-09-06 12:22:16 +00:00
off_t bytesleft, trailerlength;
char *buf; /* start of memory mapped area */
const char *p, *q, *trailerstart;
2005-05-25 13:37:33 +00:00
const char *xrefstart; /* cross reference table */
/*size_t xreflength;*/
int printed_predictor_message, printed_embedded_font_message, rc;
2007-10-30 16:31:51 +00:00
unsigned int files;
2009-10-02 18:09:31 +02:00
fmap_t *map = *ctx->fmap;
int opt_failed = 0;
cli_dbgmsg("in cli_pdf(%s)\n", dir);
2009-08-31 05:37:43 +02:00
size = map->len - offset;
if(size <= 7) /* doesn't even include the file header */
return CL_CLEAN;
2009-08-31 06:16:12 +02:00
p = buf = fmap_need_off_once(map, 0, size); /* FIXME: really port to fmap */
2009-08-25 01:21:15 +02:00
if(!buf) {
cli_errmsg("cli_pdf: mmap() failed\n");
return CL_EMAP;
}
2007-02-12 20:47:28 +00:00
cli_dbgmsg("cli_pdf: scanning %lu bytes\n", (unsigned long)size);
/* Lines are terminated by \r, \n or both */
/* File Header */
bytesleft = size - 5;
for(q = p; bytesleft; bytesleft--, q++) {
if(!strncasecmp(q, "%PDF-", 5)) {
bytesleft = size - (off_t) (q - p);
p = q;
break;
}
}
if(!bytesleft) {
cli_dbgmsg("cli_pdf: file header not found\n");
return CL_CLEAN;
}
/* Find the file trailer */
for(q = &p[bytesleft - 5]; q > p; --q)
if(strncasecmp(q, "%%EOF", 5) == 0)
break;
if(q <= p) {
cli_dbgmsg("cli_pdf: trailer not found\n");
return CL_CLEAN;
}
for(trailerstart = &q[-7]; trailerstart > p; --trailerstart)
if(memcmp(trailerstart, "trailer", 7) == 0)
break;
/*
* q points to the end of the trailer section
*/
trailerlength = (long)(q - trailerstart);
if(cli_memstr(trailerstart, trailerlength, "Encrypt", 7)) {
/*
* This tends to mean that the file is, in effect, read-only
* http://www.cs.cmu.edu/~dst/Adobe/Gallery/anon21jul01-pdf-encryption.txt
* http://www.adobe.com/devnet/pdf/
*/
cli_dbgmsg("cli_pdf: Encrypted PDF files not yet supported\n");
return CL_CLEAN;
}
2005-07-30 10:08:59 +00:00
/*
* not true, since edits may put data after the trailer
bytesleft -= trailerlength;
2005-07-30 10:08:59 +00:00
*/
/*
* FIXME: Handle more than one xref section in the xref table
*/
2005-05-25 13:37:33 +00:00
for(xrefstart = trailerstart; xrefstart > p; --xrefstart)
if(memcmp(xrefstart, "xref", 4) == 0)
/*
* Make sure it's the start of the line, not a startxref
* token
*/
if((xrefstart[-1] == '\n') || (xrefstart[-1] == '\r'))
break;
2005-05-25 13:37:33 +00:00
if(xrefstart == p) {
cli_dbgmsg("cli_pdf: xref not found\n");
return CL_CLEAN;
2005-05-25 13:37:33 +00:00
}
printed_predictor_message = printed_embedded_font_message = 0;
2005-07-30 10:08:59 +00:00
/*
* not true, since edits may put data after the trailer
xreflength = (size_t)(trailerstart - xrefstart);
2005-05-25 13:37:33 +00:00
bytesleft -= xreflength;
2005-07-30 10:08:59 +00:00
*/
2005-05-25 13:37:33 +00:00
2007-10-30 16:31:51 +00:00
files = 0;
rc = CL_CLEAN;
2005-05-25 13:37:33 +00:00
/*
2005-07-30 10:08:59 +00:00
* The body section consists of a sequence of indirect objects
2005-05-25 13:37:33 +00:00
*/
2008-02-12 00:58:49 +00:00
while((p < xrefstart) && (cli_checklimits("cli_pdf", ctx, 0, 0, 0)==CL_CLEAN) &&
((q = pdf_nextobject(p, bytesleft)) != NULL)) {
int is_ascii85decode, is_flatedecode, fout, len, has_cr;
2005-12-17 16:46:11 +00:00
/*int object_number, generation_number;*/
2005-05-25 13:37:33 +00:00
const char *objstart, *objend, *streamstart, *streamend;
unsigned long length, objlen, real_streamlen, calculated_streamlen;
int is_embedded_font, predictor;
char fullname[NAME_MAX + 1];
rc = CL_CLEAN;
2005-07-30 10:08:59 +00:00
if(q == xrefstart)
break;
if(memcmp(q, "xref", 4) == 0)
break;
/*object_number = atoi(q);*/
2007-09-06 12:22:16 +00:00
bytesleft -= (off_t)(q - p);
p = q;
if(memcmp(q, "endobj", 6) == 0)
continue;
2005-07-30 10:08:59 +00:00
if(!isdigit(*q)) {
cli_dbgmsg("cli_pdf: Object number missing\n");
2005-07-30 10:08:59 +00:00
break;
}
q = pdf_nextobject(p, bytesleft);
if((q == NULL) || !isdigit(*q)) {
cli_dbgmsg("cli_pdf: Generation number missing\n");
2005-07-30 10:08:59 +00:00
break;
}
/*generation_number = atoi(q);*/
2007-09-06 12:22:16 +00:00
bytesleft -= (off_t)(q - p);
2005-07-30 10:08:59 +00:00
p = q;
q = pdf_nextobject(p, bytesleft);
if((q == NULL) || (memcmp(q, "obj", 3) != 0)) {
cli_dbgmsg("cli_pdf: Indirect object missing \"obj\"\n");
2005-07-30 10:08:59 +00:00
break;
}
2007-09-06 12:22:16 +00:00
bytesleft -= (off_t)((q - p) + 3);
2005-07-30 10:08:59 +00:00
objstart = p = &q[3];
objend = cli_memstr(p, bytesleft, "endobj", 6);
2005-05-25 13:37:33 +00:00
if(objend == NULL) {
cli_dbgmsg("cli_pdf: No matching endobj\n");
break;
}
2007-09-06 12:22:16 +00:00
bytesleft -= (off_t)((objend - p) + 6);
2005-05-25 13:37:33 +00:00
p = &objend[6];
objlen = (unsigned long)(objend - objstart);
2005-05-25 13:37:33 +00:00
/* Is this object a stream? */
streamstart = cli_memstr(objstart, objlen, "stream", 6);
if(streamstart == NULL)
continue;
is_embedded_font = length = is_ascii85decode =
is_flatedecode = 0;
predictor = 1;
/*
* TODO: handle F and FFilter?
*/
q = objstart;
while(q < streamstart) {
2005-07-30 10:08:59 +00:00
if(*q == '/') { /* name object */
/*cli_dbgmsg("Name object %8.8s\n", q+1, q+1);*/
2005-05-25 13:37:33 +00:00
if(strncmp(++q, "Length ", 7) == 0) {
q += 7;
length = atoi(q);
while(isdigit(*q))
q++;
/*
* Note: incremental updates are not
* supported
*/
if((bytesleft > 11) && strncmp(q, " 0 R", 4) == 0) {
const char *r, *nq;
2007-04-16 11:52:03 +00:00
char b[14];
q += 4;
cli_dbgmsg("cli_pdf: Length is in indirect obj %lu\n",
length);
snprintf(b, sizeof(b),
"%lu 0 obj", length);
length = (unsigned long)strlen(b);
/* optimization: assume objects
* are sequential */
if(!opt_failed) {
nq = q;
len = buf + size - q;
} else {
nq = buf;
len = q - buf;
}
do {
r = cli_memstr(nq, len, b, length);
if (r > nq) {
const char x = *(r-1);
if (x == '\n' || x=='\r') {
--r;
break;
}
}
if (r) {
len -= r + length - nq;
nq = r + length;
} else if (!opt_failed) {
/* we failed optimized match,
* try matching from the beginning
*/
len = q - buf;
r = nq = buf;
/* prevent
* infloop */
opt_failed = 1;
}
} while (r);
if(r) {
r += length - 1;
r = pdf_nextobject(r, bytesleft - (r - q));
if(r) {
length = atoi(r);
while(isdigit(*r))
r++;
cli_dbgmsg("cli_pdf: length in '%s' %lu\n",
2007-04-16 11:52:03 +00:00
&b[1],
length);
}
} else
cli_dbgmsg("cli_pdf: Couldn't find '%s'\n",
2007-04-16 11:52:03 +00:00
&b[1]);
}
2005-05-25 13:37:33 +00:00
q--;
} else if(strncmp(q, "Length2 ", 8) == 0)
is_embedded_font = 1;
else if(strncmp(q, "Predictor ", 10) == 0) {
q += 10;
predictor = atoi(q);
while(isdigit(*q))
q++;
q--;
2005-05-25 13:37:33 +00:00
} else if(strncmp(q, "FlateDecode", 11) == 0) {
is_flatedecode = 1;
q += 11;
} else if(strncmp(q, "ASCII85Decode", 13) == 0) {
is_ascii85decode = 1;
2005-05-25 13:37:33 +00:00
q += 13;
}
}
2005-07-30 10:08:59 +00:00
q = pdf_nextobject(q, (size_t)(streamstart - q));
if(q == NULL)
break;
}
if(is_embedded_font) {
/*
* Need some documentation, the only I can find a
2007-03-01 11:10:01 +00:00
* reference to is not free, if some kind soul wishes
* to donate a copy, please contact me!
* (http://safari.adobepress.com/0321304748)
*/
if(!printed_embedded_font_message) {
cli_dbgmsg("cli_pdf: Embedded fonts not yet supported\n");
printed_embedded_font_message = 1;
}
continue;
}
if(predictor > 1) {
/*
* Needs some thought
*/
if(!printed_predictor_message) {
cli_dbgmsg("cli_pdf: Predictor %d not honoured for embedded image\n",
predictor);
printed_predictor_message = 1;
}
continue;
}
2005-05-25 13:37:33 +00:00
/* objend points to the end of the object (start of "endobj") */
streamstart += 6; /* go past the word "stream" */
len = (int)(objend - streamstart);
q = pdf_nextlinestart(streamstart, len);
if(q == NULL)
break;
2005-05-25 13:37:33 +00:00
len -= (int)(q - streamstart);
streamstart = q;
streamend = cli_memstr(streamstart, len, "endstream\n", 10);
2005-05-25 13:37:33 +00:00
if(streamend == NULL) {
streamend = cli_memstr(streamstart, len, "endstream\r", 10);
2005-05-25 13:37:33 +00:00
if(streamend == NULL) {
cli_dbgmsg("cli_pdf: No endstream\n");
break;
}
has_cr = 1;
2007-03-30 21:18:37 +00:00
} else
has_cr = 0;
2009-09-24 19:07:39 +02:00
snprintf(fullname, sizeof(fullname), "%s"PATHSEP"pdf%02u", dir, files);
2008-02-14 15:24:07 +00:00
fout = open(fullname, O_RDWR|O_CREAT|O_EXCL|O_TRUNC|O_BINARY, 0600);
if(fout < 0) {
char err[128];
cli_errmsg("cli_pdf: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err)));
rc = CL_ETMPFILE;
break;
}
/*
* Calculate the length ourself, the Length parameter is often
* wrong
*/
if((*--streamend != '\n') && (*streamend != '\r'))
2007-03-09 22:39:06 +00:00
streamend++;
else if(has_cr && (*--streamend != '\r'))
2007-03-09 22:39:06 +00:00
streamend++;
if(streamend <= streamstart) {
close(fout);
cli_dbgmsg("cli_pdf: Empty stream\n");
if (cli_unlink(fullname)) {
rc = CL_EUNLINK;
break;
}
continue;
}
calculated_streamlen = (int)(streamend - streamstart);
real_streamlen = length;
cli_dbgmsg("cli_pdf: length %lu, calculated_streamlen %lu isFlate %d isASCII85 %d\n",
length, calculated_streamlen,
is_flatedecode, is_ascii85decode);
if(calculated_streamlen != real_streamlen) {
cli_dbgmsg("cli_pdf: Incorrect Length field in file attempting to recover\n");
if(real_streamlen > calculated_streamlen)
real_streamlen = calculated_streamlen;
}
#if 0
/* FIXME: this isn't right... */
if(length)
/*streamlen = (is_flatedecode) ? length : MIN(length, streamlen);*/
streamlen = MIN(length, streamlen);
#endif
if(is_ascii85decode) {
unsigned char *tmpbuf;
2008-02-12 00:58:49 +00:00
int ret = cli_checklimits("cli_pdf", ctx, calculated_streamlen * 5, calculated_streamlen, real_streamlen);
2008-02-11 20:19:20 +00:00
if(ret != CL_CLEAN) {
close(fout);
if (cli_unlink(fullname)) {
rc = CL_EUNLINK;
break;
}
2008-02-11 20:19:20 +00:00
continue;
}
tmpbuf = cli_malloc(calculated_streamlen * 5);
if(tmpbuf == NULL) {
close(fout);
if (cli_unlink(fullname)) {
rc = CL_EUNLINK;
break;
}
continue;
}
ret = ascii85decode(streamstart, calculated_streamlen, tmpbuf);
if(ret == -1) {
free(tmpbuf);
close(fout);
if (cli_unlink(fullname)) {
rc = CL_EUNLINK;
break;
}
continue;
}
if(ret) {
unsigned char *t;
real_streamlen = ret;
/* free unused trailing bytes */
t = (unsigned char *)cli_realloc(tmpbuf,calculated_streamlen);
if(t == NULL) {
free(tmpbuf);
close(fout);
if (cli_unlink(fullname)) {
rc = CL_EUNLINK;
break;
}
continue;
}
tmpbuf = t;
/*
* Note that it will probably be both
* ascii85encoded and flateencoded
*/
2008-02-11 20:19:20 +00:00
if(is_flatedecode)
rc = try_flatedecode((unsigned char *)tmpbuf, real_streamlen, real_streamlen, fout, ctx);
else
rc = (unsigned long)cli_writen(fout, (const char *)streamstart, real_streamlen)==real_streamlen ? CL_CLEAN : CL_EWRITE;
}
free(tmpbuf);
2008-02-11 20:19:20 +00:00
} else if(is_flatedecode) {
rc = try_flatedecode((unsigned char *)streamstart, real_streamlen, calculated_streamlen, fout, ctx);
2008-02-11 20:19:20 +00:00
} else {
2007-02-12 20:47:28 +00:00
cli_dbgmsg("cli_pdf: writing %lu bytes from the stream\n",
(unsigned long)real_streamlen);
2008-02-12 00:58:49 +00:00
if((rc = cli_checklimits("cli_pdf", ctx, real_streamlen, 0, 0))==CL_CLEAN)
rc = (unsigned long)cli_writen(fout, (const char *)streamstart, real_streamlen) == real_streamlen ? CL_CLEAN : CL_EWRITE;
}
2008-02-12 00:58:49 +00:00
if (rc == CL_CLEAN) {
cli_dbgmsg("cli_pdf: extracted file %u to %s\n", files, fullname);
files++;
2008-02-12 00:58:49 +00:00
lseek(fout, 0, SEEK_SET);
rc = cli_magic_scandesc(fout, ctx);
}
close(fout);
if(!ctx->engine->keeptmp)
if (cli_unlink(fullname)) rc = CL_EUNLINK;
2008-02-12 00:58:49 +00:00
if(rc != CL_CLEAN) break;
}
cli_dbgmsg("cli_pdf: returning %d\n", rc);
return rc;
2005-04-27 12:45:17 +00:00
}
/*
* flate inflation
*/
static int
try_flatedecode(unsigned char *buf, off_t real_len, off_t calculated_len, int fout, cli_ctx *ctx)
{
2008-02-11 20:19:20 +00:00
int ret = cli_checklimits("cli_pdf", ctx, real_len, 0, 0);
2008-02-11 20:19:20 +00:00
if (ret==CL_CLEAN && flatedecode(buf, real_len, fout, ctx) == CL_SUCCESS)
2008-02-12 00:58:49 +00:00
return CL_CLEAN;
2007-03-30 11:32:32 +00:00
if(real_len == calculated_len) {
/*
* Nothing more we can do to inflate
*/
cli_dbgmsg("cli_pdf: Bad compression in flate stream\n");
return CL_CLEAN;
2007-03-30 11:32:32 +00:00
}
2008-02-11 20:19:20 +00:00
if(cli_checklimits("cli_pdf", ctx, calculated_len, 0, 0)!=CL_CLEAN)
return CL_CLEAN;
ret = flatedecode(buf, calculated_len, fout, ctx);
2008-02-12 00:58:49 +00:00
if(ret == CL_CLEAN)
return CL_CLEAN;
/* i.e. the PDF file is broken :-( */
cli_dbgmsg("cli_pdf: Bad compressed block length in flate stream\n");
return ret;
}
static int
flatedecode(unsigned char *buf, off_t len, int fout, cli_ctx *ctx)
{
int zstat, ret;
off_t nbytes;
z_stream stream;
unsigned char output[BUFSIZ];
#ifdef SAVE_TMP
char tmpfilename[16];
int tmpfd;
#endif
cli_dbgmsg("cli_pdf: flatedecode %lu bytes\n", (unsigned long)len);
2007-04-16 11:52:03 +00:00
if(len == 0) {
cli_dbgmsg("cli_pdf: flatedecode len == 0\n");
return CL_CLEAN;
2007-04-16 11:52:03 +00:00
}
#ifdef SAVE_TMP
/*
* Copy the embedded area for debugging, so that if it falls over
* we have a copy of the offending data. This is debugging code
* that you shouldn't of course install in a live environment. I am
* not interested in hearing about security issues with this section
* of the parser.
*/
strcpy(tmpfilename, "/tmp/pdfXXXXXX");
tmpfd = mkstemp(tmpfilename);
if(tmpfd < 0) {
perror(tmpfilename);
cli_errmsg("cli_pdf: Can't make debugging file\n");
} else {
FILE *tmpfp = fdopen(tmpfd, "w");
if(tmpfp) {
fwrite(buf, sizeof(char), len, tmpfp);
fclose(tmpfp);
2007-03-09 22:39:06 +00:00
cli_dbgmsg("cli_pdf: flatedecode: debugging file is %s\n",
tmpfilename);
} else
cli_errmsg("cli_pdf: can't fdopen debugging file\n");
}
#endif
stream.zalloc = (alloc_func)Z_NULL;
stream.zfree = (free_func)Z_NULL;
stream.opaque = (void *)NULL;
2007-02-12 20:47:28 +00:00
stream.next_in = (Bytef *)buf;
stream.avail_in = len;
stream.next_out = output;
stream.avail_out = sizeof(output);
zstat = inflateInit(&stream);
if(zstat != Z_OK) {
2008-10-07 13:35:21 +00:00
cli_warnmsg("cli_pdf: inflateInit failed\n");
return CL_EMEM;
}
nbytes = 0;
2007-03-30 21:18:37 +00:00
while(stream.avail_in) {
2007-01-25 14:01:06 +00:00
zstat = inflate(&stream, Z_NO_FLUSH); /* zlib */
switch(zstat) {
case Z_OK:
if(stream.avail_out == 0) {
int written;
if ((written=cli_writen(fout, output, sizeof(output)))!=sizeof(output)) {
cli_errmsg("cli_pdf: failed to write output file\n");
inflateEnd(&stream);
return CL_EWRITE;
}
nbytes += written;
if((ret=cli_checklimits("cli_pdf", ctx, nbytes, 0, 0))!=CL_CLEAN) {
inflateEnd(&stream);
return ret;
}
stream.next_out = output;
stream.avail_out = sizeof(output);
}
continue;
case Z_STREAM_END:
break;
default:
if(stream.msg)
cli_dbgmsg("cli_pdf: after writing %lu bytes, got error \"%s\" inflating PDF attachment\n",
(unsigned long)nbytes,
stream.msg);
else
cli_dbgmsg("cli_pdf: after writing %lu bytes, got error %d inflating PDF attachment\n",
(unsigned long)nbytes, zstat);
inflateEnd(&stream);
return CL_CLEAN;
}
break;
}
if(stream.avail_out != sizeof(output)) {
if(cli_writen(fout, output, sizeof(output) - stream.avail_out) < 0) {
cli_errmsg("cli_pdf: failed to write output file\n");
inflateEnd(&stream);
return CL_EWRITE;
}
}
#ifdef SAVE_TMP
if (cli_unlink(tmpfilename)) {
inflateEnd(&stream);
return CL_EUNLINK;
}
#endif
inflateEnd(&stream);
return CL_CLEAN;
}
2006-04-11 11:57:39 +00:00
/*
* ascii85 inflation, returns number of bytes in output, -1 for error
*
* See http://www.piclist.com/techref/method/encode.htm (look for base85)
*/
static int
2007-02-10 14:23:52 +00:00
ascii85decode(const char *buf, off_t len, unsigned char *output)
{
2006-04-11 11:57:39 +00:00
const char *ptr;
uint32_t sum = 0;
int quintet = 0;
int ret = 0;
if(cli_memstr(buf, len, "~>", 2) == NULL)
cli_dbgmsg("cli_pdf: ascii85decode: no EOF marker found\n");
2006-04-11 11:57:39 +00:00
ptr = buf;
cli_dbgmsg("cli_pdf: ascii85decode %lu bytes\n", (unsigned long)len);
while(len > 0) {
int byte = (len--) ? (int)*ptr++ : EOF;
if((byte == '~') && (*ptr == '>'))
byte = EOF;
if(byte >= '!' && byte <= 'u') {
sum = (sum * 85) + ((uint32_t)byte - '!');
if(++quintet == 5) {
2005-06-01 12:58:44 +00:00
*output++ = (unsigned char)(sum >> 24);
*output++ = (unsigned char)((sum >> 16) & 0xFF);
*output++ = (unsigned char)((sum >> 8) & 0xFF);
*output++ = (unsigned char)(sum & 0xFF);
ret += 4;
quintet = 0;
sum = 0;
}
} else if(byte == 'z') {
if(quintet) {
cli_dbgmsg("ascii85decode: unexpected 'z'\n");
return -1;
}
*output++ = '\0';
*output++ = '\0';
*output++ = '\0';
*output++ = '\0';
ret += 4;
} else if(byte == EOF) {
2006-04-11 11:57:39 +00:00
cli_dbgmsg("ascii85decode: quintet %d\n", quintet);
if(quintet) {
int i;
if(quintet == 1) {
cli_dbgmsg("ascii85Decode: only 1 byte in last quintet\n");
return -1;
}
for(i = quintet; i < 5; i++)
sum *= 85;
if(quintet > 1)
sum += (0xFFFFFF >> ((quintet - 2) * 8));
ret += quintet;
for(i = 0; i < quintet - 1; i++)
2005-06-01 12:58:44 +00:00
*output++ = (unsigned char)((sum >> (24 - 8 * i)) & 0xFF);
quintet = 0;
}
2005-05-25 13:37:33 +00:00
len = 0;
break;
} else if(!isspace(byte)) {
cli_dbgmsg("ascii85Decode: invalid character 0x%x, len %lu\n",
2007-02-12 20:47:28 +00:00
byte & 0xFF, (unsigned long)len);
return -1;
}
}
return ret;
}
/*
* Find the start of the next line
*/
static const char *
pdf_nextlinestart(const char *ptr, size_t len)
{
while(strchr("\r\n", *ptr) == NULL) {
if(--len == 0L)
return NULL;
ptr++;
}
while(strchr("\r\n", *ptr) != NULL) {
if(--len == 0L)
return NULL;
ptr++;
}
return ptr;
}
2005-07-30 10:08:59 +00:00
/*
* Return the start of the next PDF object.
* This assumes that we're not in a stream.
*/
static const char *
pdf_nextobject(const char *ptr, size_t len)
{
const char *p;
int inobject = 1;
while(len) {
switch(*ptr) {
case '\n':
case '\r':
case '%': /* comment */
p = pdf_nextlinestart(ptr, len);
if(p == NULL)
return NULL;
len -= (size_t)(p - ptr);
ptr = p;
inobject = 0;
break;
case ' ':
case '\t':
case '[': /* Start of an array object */
2005-07-30 10:08:59 +00:00
case '\v':
case '\f':
case '<': /* Start of a dictionary object */
2005-07-30 10:08:59 +00:00
inobject = 0;
ptr++;
len--;
break;
case '/': /* Start of a name object */
return ptr;
default:
2005-07-30 10:08:59 +00:00
if(!inobject)
/* TODO: parse and return object type */
return ptr;
ptr++;
len--;
}
}
return NULL;
}