Merge pull request #27 from cisco-sbg/CLAM-2752-pdf-overflow-1.4

Fix integer overflow in PDF parser (1.4.3)
2025-10-19 10:23:17 +00:00 · 2025-05-22 18:33:07 -04:00 · 2025-05-22 18:33:07 -04:00 · bca003b028
commit bca003b028
parent 8c62d0a0e6 7fe290b573
3 changed files with 87 additions and 51 deletions
--- a/libclamav/pdf.c
+++ b/libclamav/pdf.c
@@ -440,7 +440,7 @@ int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm,

        if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
            /* Failed to find obj offset for next obj */
-            cli_dbgmsg("pdf_findobj_in_objstm: Failed to find next obj offset for obj in object stream though there should be {%u} more.\n", objstm->n - objstm->nobjs_found);
+            cli_dbgmsg("pdf_findobj_in_objstm: Failed to find next obj offset for obj in object stream though there should be {%zu} more.\n", objstm->n - objstm->nobjs_found);
            status = CL_EPARSE;
            goto done;
        } else if (temp_long < 0) {
@@ -1563,18 +1563,18 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
                }
            }

-            cli_dbgmsg("pdf_extract_obj: calculated length %lld\n", (long long)length);
+            cli_dbgmsg("pdf_extract_obj: calculated length %zu\n", length);
        } else {
            if (obj->stream_size > (size_t)length + 2) {
                cli_dbgmsg("cli_pdf: calculated length %zu < %zu\n",
-                           (size_t)length, obj->stream_size);
+                           length, obj->stream_size);
                length = obj->stream_size;
            }
        }

-        if ((0 != orig_length) && (obj->stream_size > (size_t)orig_length + 20)) {
-            cli_dbgmsg("pdf_extract_obj: orig length: %lld, length: %lld, size: %zu\n",
-                       (long long)orig_length, (long long)length, obj->stream_size);
+        if ((0 != orig_length) && (obj->stream_size > orig_length + 20)) {
+            cli_dbgmsg("pdf_extract_obj: orig length: %zu, length: %zu, size: %zu\n",
+                       orig_length, length, obj->stream_size);
            pdfobj_flag(pdf, obj, BAD_STREAMLEN);
        }

@@ -1628,18 +1628,18 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
         */
        dict_len = obj->stream - start;
        if (NULL != (pstr = pdf_getdict(start, &dict_len, "/Type/ObjStm"))) {
-            int32_t objstm_first  = -1;
-            int32_t objstm_length = -1;
-            int32_t objstm_n      = -1;
+            int objstm_first  = -1;
+            int objstm_length = -1;
+            int objstm_n      = -1;

            cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n");

            dict_len = obj->stream - start;
-            if ((-1 == (objstm_first = pdf_readint(start, dict_len, "/First")))) {
+            if (-1 == (objstm_first = pdf_readint(start, dict_len, "/First"))) {
                cli_warnmsg("pdf_extract_obj: Failed to find offset of first object in object stream\n");
-            } else if ((-1 == (objstm_length = pdf_readint(start, dict_len, "/Length")))) {
+            } else if (-1 == (objstm_length = pdf_readint(start, dict_len, "/Length"))) {
                cli_warnmsg("pdf_extract_obj: Failed to find length of object stream\n");
-            } else if ((-1 == (objstm_n = pdf_readint(start, dict_len, "/N")))) {
+            } else if (-1 == (objstm_n = pdf_readint(start, dict_len, "/N"))) {
                cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n");
            } else {
                /* Add objstm to pdf struct, so it can be freed eventually */
@@ -1661,19 +1661,19 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t

                memset(objstm, 0, sizeof(*objstm));

-                objstm->first        = (uint32_t)objstm_first;
-                objstm->current      = (uint32_t)objstm_first;
+                objstm->first        = (size_t)objstm_first;
+                objstm->current      = (size_t)objstm_first;
                objstm->current_pair = 0;
-                objstm->length       = (uint32_t)objstm_length;
-                objstm->n            = (uint32_t)objstm_n;
+                objstm->length       = (size_t)objstm_length;
+                objstm->n            = (size_t)objstm_n;

-                cli_dbgmsg("pdf_extract_obj: ObjStm first obj at offset %d\n", objstm->first);
-                cli_dbgmsg("pdf_extract_obj: ObjStm length is %d bytes\n", objstm->length);
-                cli_dbgmsg("pdf_extract_obj: ObjStm should contain %d objects\n", objstm->n);
+                cli_dbgmsg("pdf_extract_obj: ObjStm first obj at offset %zu\n", objstm->first);
+                cli_dbgmsg("pdf_extract_obj: ObjStm length is %zu bytes\n", objstm->length);
+                cli_dbgmsg("pdf_extract_obj: ObjStm should contain %zu objects\n", objstm->n);
            }
        }

-        sum = pdf_decodestream(pdf, obj, dparams, obj->stream, (uint32_t)length, xref, fout, &rc, objstm);
+        sum = pdf_decodestream(pdf, obj, dparams, obj->stream, length, xref, fout, &rc, objstm);
        if ((CL_SUCCESS != rc) && (CL_VIRUS != rc)) {
            cli_dbgmsg("Error decoding stream! Error code: %d\n", rc);

@@ -3535,7 +3535,7 @@ cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objs
        retval = pdf_findobj_in_objstm(pdf, objstm, &obj);
        if (retval != CL_SUCCESS) {
            if (retval != CL_BREAK) {
-                cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Fewer objects in stream than expected: %u found, %u expected.\n",
+                cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Fewer objects in stream than expected: %zu found, %zu expected.\n",
                           objstm->nobjs_found, objstm->n);
                badobjects++;
                pdf->stats.ninvalidobjs++;
--- a/libclamav/pdf.h
+++ b/libclamav/pdf.h
@@ -27,14 +27,14 @@
 #define PDF_OBJECT_RECURSION_LIMIT 25

 struct objstm_struct {
-    uint32_t first;        // offset of first obj
-    uint32_t current;      // offset of current obj
-    uint32_t current_pair; // offset of current pair describing id, location of object
-    uint32_t length;       // total length of all objects (starting at first)
-    uint32_t n;            // number of objects that should be found in the object stream
-    uint32_t nobjs_found;  // number of objects actually found in the object stream
-    char *streambuf;       // address of stream buffer, beginning with first obj pair
-    size_t streambuf_len;  // length of stream buffer, includes pairs followed by actual objects
+    size_t first;         // offset of first obj
+    size_t current;       // offset of current obj
+    size_t current_pair;  // offset of current pair describing id, location of object
+    size_t length;        // total length of all objects (starting at first)
+    size_t n;             // number of objects that should be found in the object stream
+    size_t nobjs_found;   // number of objects actually found in the object stream
+    char *streambuf;      // address of stream buffer, beginning with first obj pair
+    size_t streambuf_len; // length of stream buffer, includes pairs followed by actual objects
 };

 struct pdf_obj {
--- a/libclamav/pdfdecode.c
+++ b/libclamav/pdfdecode.c
@@ -73,7 +73,7 @@
 struct pdf_token {
    uint32_t flags;   /* tracking flags */
    uint32_t success; /* successfully decoded filters */
-    uint32_t length;  /* length of current content; TODO: transition to size_t */
+    size_t length;    /* length of current content; TODO: transition to size_t */
    uint8_t *content; /* content stream */
 };

@@ -401,10 +401,16 @@ static cl_error_t filter_ascii85decode(struct pdf_struct *pdf, struct pdf_obj *o
    uint32_t declen = 0;

    const uint8_t *ptr = (uint8_t *)token->content;
-    uint32_t remaining = token->length;
+    size_t remaining   = token->length;
    int quintet = 0, rc = CL_SUCCESS;
    uint64_t sum = 0;

+    /* Check for overflow */
+    if (remaining > (SIZE_MAX / 4)) {
+        cli_dbgmsg("cli_pdf: ascii85decode: overflow detected\n");
+        return CL_EFORMAT;
+    }
+
    /* 5:4 decoding ratio, with 1:4 expansion sequences => (4*length)+1 */
    if (!(dptr = decoded = (uint8_t *)cli_max_malloc((4 * remaining) + 1))) {
        cli_errmsg("cli_pdf: cannot allocate memory for decoded output\n");
@@ -791,8 +797,8 @@ static cl_error_t filter_asciihexdecode(struct pdf_struct *pdf, struct pdf_obj *
    uint8_t *decoded;

    const uint8_t *content = (uint8_t *)token->content;
-    uint32_t length        = token->length;
-    uint32_t i, j;
+    size_t length          = token->length;
+    size_t i, j;
    cl_error_t rc = CL_SUCCESS;

    if (!(decoded = (uint8_t *)cli_max_calloc(length / 2 + 1, sizeof(uint8_t)))) {
@@ -822,8 +828,8 @@ static cl_error_t filter_asciihexdecode(struct pdf_struct *pdf, struct pdf_obj *
    if (rc == CL_SUCCESS) {
        free(token->content);

-        cli_dbgmsg("cli_pdf: deflated %lu bytes from %lu total bytes\n",
-                   (unsigned long)j, (unsigned long)(token->length));
+        cli_dbgmsg("cli_pdf: deflated %zu bytes from %zu total bytes\n",
+                   j, token->length);

        token->content = decoded;
        token->length  = j;
@@ -831,8 +837,8 @@ static cl_error_t filter_asciihexdecode(struct pdf_struct *pdf, struct pdf_obj *
        if (!(obj->flags & ((1 << OBJ_IMAGE) | (1 << OBJ_TRUNCATED))))
            pdfobj_flag(pdf, obj, BAD_ASCIIDECODE);

-        cli_dbgmsg("cli_pdf: error occurred parsing byte %lu of %lu\n",
-                   (unsigned long)i, (unsigned long)(token->length));
+        cli_dbgmsg("cli_pdf: error occurred parsing byte %zu of %zu\n",
+                   i, token->length);
        free(decoded);
    }
    return rc;
@@ -873,27 +879,29 @@ static cl_error_t filter_decrypt(struct pdf_struct *pdf, struct pdf_obj *obj, st
        return CL_EPARSE; /* TODO: what should this value be? CL_SUCCESS would mirror previous behavior */
    }

-    cli_dbgmsg("cli_pdf: decrypted %zu bytes from %u total bytes\n",
+    cli_dbgmsg("cli_pdf: decrypted %zu bytes from %zu total bytes\n",
               length, token->length);

    free(token->content);
    token->content = (uint8_t *)decrypted;
-    token->length  = (uint32_t)length; /* this may truncate unfortunately, TODO: use 64-bit values internally? */
+    token->length  = length;
    return CL_SUCCESS;
 }

 static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token)
 {
    uint8_t *decoded, *temp;
-    uint32_t declen = 0, capacity = 0;
+    size_t declen = 0, capacity = 0;

    uint8_t *content = (uint8_t *)token->content;
    uint32_t length  = token->length;
    lzw_stream stream;
    int echg = 1, lzwstat, rc = CL_SUCCESS;

-    if (pdf->ctx && !(pdf->ctx->dconf->other & OTHER_CONF_LZW))
-        return CL_BREAK;
+    if (pdf->ctx && !(pdf->ctx->dconf->other & OTHER_CONF_LZW)) {
+        rc = CL_BREAK;
+        goto done;
+    }

    if (params) {
        struct pdf_dict_node *node = params->nodes;
@@ -924,15 +932,18 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
         * Sample 0015315109, it has \r followed by zlib header.
         * Flag pdf as suspicious, and attempt to extract by skipping the \r.
         */
-        if (!length)
-            return CL_SUCCESS;
+        if (!length) {
+            rc = CL_SUCCESS;
+            goto done;
+        }
    }

    capacity = INFLATE_CHUNK_SIZE;

    if (!(decoded = (uint8_t *)malloc(capacity))) {
        cli_errmsg("cli_pdf: cannot allocate memory for decoded output\n");
-        return CL_EMEM;
+        rc = CL_EMEM;
+        goto done;
    }

    memset(&stream, 0, sizeof(stream));
@@ -947,7 +958,8 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
    if (lzwstat != Z_OK) {
        cli_warnmsg("cli_pdf: lzwInit failed\n");
        free(decoded);
-        return CL_EMEM;
+        rc = CL_EMEM;
+        goto done;
    }

    /* initial inflate */
@@ -962,16 +974,23 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
            length -= q - content;
            content = q;

-            stream.next_in   = (Bytef *)content;
-            stream.avail_in  = length;
-            stream.next_out  = (Bytef *)decoded;
+            stream.next_in  = (Bytef *)content;
+            stream.avail_in = length;
+            stream.next_out = (Bytef *)decoded;
+            /* Make sure we don't overflow during type conversion */
+            if (capacity > UINT_MAX) {
+                cli_dbgmsg("cli_pdf: lzwdecode: overflow detected\n");
+                rc = CL_EFORMAT;
+                goto done;
+            }
            stream.avail_out = capacity;

            lzwstat = lzwInit(&stream);
            if (lzwstat != Z_OK) {
                cli_warnmsg("cli_pdf: lzwInit failed\n");
                free(decoded);
-                return CL_EMEM;
+                rc = CL_EMEM;
+                goto done;
            }

            pdfobj_flag(pdf, obj, BAD_FLATESTART);
@@ -984,7 +1003,7 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
        /* extend output capacity if needed,*/
        if (stream.avail_out == 0) {
            if ((rc = cli_checklimits("pdf", pdf->ctx, capacity + INFLATE_CHUNK_SIZE, 0, 0)) != CL_SUCCESS) {
-                cli_dbgmsg("cli_pdf: required buffer size to inflate compressed filter exceeds maximum: %u\n", capacity + INFLATE_CHUNK_SIZE);
+                cli_dbgmsg("cli_pdf: required buffer size to inflate compressed filter exceeds maximum: %zu\n", capacity + INFLATE_CHUNK_SIZE);
                break;
            }

@@ -996,7 +1015,17 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
            decoded          = temp;
            stream.next_out  = decoded + capacity;
            stream.avail_out = INFLATE_CHUNK_SIZE;
+            if (declen > (SIZE_MAX - INFLATE_CHUNK_SIZE)) {
+                cli_dbgmsg("cli_pdf: lzwdecode: overflow detected\n");
+                rc = CL_EFORMAT;
+                goto done;
+            }
            declen += INFLATE_CHUNK_SIZE;
+            if (capacity > (SIZE_MAX - INFLATE_CHUNK_SIZE)) {
+                cli_dbgmsg("cli_pdf: lzwdecode: overflow detected\n");
+                rc = CL_EFORMAT;
+                goto done;
+            }
            capacity += INFLATE_CHUNK_SIZE;
        }

@@ -1004,6 +1033,12 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
        lzwstat = lzwInflate(&stream);
    }

+    if (declen > (UINT32_MAX - (INFLATE_CHUNK_SIZE - stream.avail_out))) {
+        cli_dbgmsg("cli_pdf: lzwdecode: overflow detected\n");
+        rc = CL_EFORMAT;
+        goto done;
+    }
+
    /* add stream end fragment to decoded length */
    declen += (INFLATE_CHUNK_SIZE - stream.avail_out);

@@ -1044,6 +1079,7 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,

    (void)lzwInflateEnd(&stream);

+done:
    if (rc == CL_SUCCESS) {
        if (declen == 0) {
            cli_dbgmsg("cli_pdf: empty stream after inflation completed.\n");