2006-12-26 16:17:02 +00:00
|
|
|
/*
|
|
|
|
* HTML Entity & Encoding normalization.
|
|
|
|
*
|
2020-01-03 15:44:07 -05:00
|
|
|
* Copyright (C) 2013-2020 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
|
2019-01-25 10:15:50 -05:00
|
|
|
* Copyright (C) 2007-2013 Sourcefire, Inc.
|
2008-04-02 15:24:51 +00:00
|
|
|
*
|
|
|
|
* Authors: Török Edvin
|
2006-12-26 16:17:02 +00:00
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
2008-04-02 15:24:51 +00:00
|
|
|
* it under the terms of the GNU General Public License version 2 as
|
2007-06-30 11:50:56 +00:00
|
|
|
* published by the Free Software Foundation.
|
2006-12-26 16:17:02 +00:00
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program; if not, write to the Free Software
|
|
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
|
|
* MA 02110-1301, USA.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _ENTITIES_H
|
|
|
|
#define _ENTITIES_H
|
2008-01-21 15:52:21 +00:00
|
|
|
|
2020-04-10 07:23:03 -07:00
|
|
|
#include "clamav-config.h"
|
|
|
|
|
|
|
|
#ifdef HAVE_ICONV
|
|
|
|
#include <iconv.h>
|
|
|
|
#endif
|
2006-12-26 16:17:02 +00:00
|
|
|
|
2020-04-10 07:23:03 -07:00
|
|
|
#include "clamav-types.h"
|
2006-12-26 16:17:02 +00:00
|
|
|
#include "hashtab.h"
|
2020-04-10 07:23:03 -07:00
|
|
|
#include "htmlnorm.h"
|
2006-12-26 16:17:02 +00:00
|
|
|
|
2008-01-25 08:37:13 +00:00
|
|
|
/*
 * Character-set name strings for the UCS-4, UTF-16 and UTF-8 encodings.
 * For UCS-4 the numeric suffix spells a byte order: 1234 is "UCS-4BE" and
 * 4321 is "UCS-4LE".
 * NOTE(review): presumably these strings are passed to iconv (this header
 * includes <iconv.h> when HAVE_ICONV is defined) -- confirm in the definition.
 */
#define UCS4_1234 "UCS-4BE"
|
|
|
|
#define UCS4_4321 "UCS-4LE"
|
2008-01-20 22:18:14 +00:00
|
|
|
/*
 * The 2143 and 3412 byte orders map to the unqualified names "UCS4" and
 * "UCS-4" rather than to a BE/LE-specific name.
 * NOTE(review): how the converter treats these two orderings is not visible
 * from this header -- confirm in the implementation.
 */
#define UCS4_2143 "UCS4"
|
|
|
|
#define UCS4_3412 "UCS-4"
|
|
|
|
#define UTF16_BE "UTF-16BE"
|
|
|
|
#define UTF16_LE "UTF-16LE"
|
2018-12-03 12:40:13 -05:00
|
|
|
#define UTF8 "UTF-8"
|
2006-12-26 16:17:02 +00:00
|
|
|
/*
 * UNDECIDED_* names: the 32-bit and 16-bit variants are plain aliases for the
 * UCS4_* / UTF16_* name strings, and UNDECIDED_8 is "ISO-8859-1".
 * NOTE(review): presumably the fallback encoding name used when only the code
 * unit width and byte order are known -- confirm against the callers.
 */
#define UNDECIDED_32_1234 UCS4_1234
|
|
|
|
#define UNDECIDED_32_4321 UCS4_4321
|
|
|
|
#define UNDECIDED_32_2143 UCS4_2143
|
|
|
|
#define UNDECIDED_32_3412 UCS4_3412
|
|
|
|
#define UNDECIDED_16_BE UTF16_BE
|
|
|
|
#define UNDECIDED_16_LE UTF16_LE
|
2008-01-20 22:18:14 +00:00
|
|
|
#define UNDECIDED_8 "ISO-8859-1"
|
2006-12-26 16:17:02 +00:00
|
|
|
|
2020-04-28 13:32:07 -07:00
|
|
|
/*
 * Numeric code page identifiers for ISO-8859-1, UTF-16LE and UTF-16BE.
 * NOTE(review): presumably the Windows code page identifiers accepted by the
 * `codepage` argument of cli_codepage_to_utf8() -- confirm against its callers.
 */
#define CODEPAGE_ISO8859_1 28591
|
|
|
|
#define CODEPAGE_UTF16_LE 1200
|
|
|
|
#define CODEPAGE_UTF16_BE 1201
|
|
|
|
|
2006-12-26 16:17:02 +00:00
|
|
|
/*
 * Sizing constant for entity handling; struct entity_conv declares its buffer
 * as MAX_ENTITY_SIZE + 2 bytes.
 * NOTE(review): presumably the longest supported entity name in bytes -- confirm.
 */
#define MAX_ENTITY_SIZE 22
|
|
|
|
|
|
|
|
/*
 * Conversion context handed to entity_norm(). It carries a single byte buffer.
 */
struct entity_conv {
|
2018-12-03 12:40:13 -05:00
|
|
|
/* MAX_ENTITY_SIZE + 2 bytes. NOTE(review): the purpose of the 2 extra bytes
 * (terminator and/or delimiter?) is not visible from this header -- confirm. */
unsigned char entity_buff[MAX_ENTITY_SIZE + 2];
|
2006-12-26 16:17:02 +00:00
|
|
|
};
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
/*
 * Encoding identifiers. No enumerator has an explicit value, so they number
 * consecutively from E_UCS4 == 0 to E_OTHER == 10; inserting or reordering
 * entries changes those values.
 * NOTE(review): the mapping between these identifiers and the encoding-name
 * strings defined earlier in this header is not visible here -- presumably it
 * lives in the implementation file.
 */
enum encodings { E_UCS4,
|
|
|
|
E_UTF16,
|
|
|
|
E_UCS4_1234,
|
|
|
|
E_UCS4_4321,
|
|
|
|
E_UCS4_2143,
|
|
|
|
E_UCS4_3412,
|
|
|
|
E_UTF16_BE,
|
|
|
|
E_UTF16_LE,
|
|
|
|
E_UTF8,
|
|
|
|
E_UNKNOWN,
|
|
|
|
E_OTHER };
|
2008-02-02 17:10:35 +00:00
|
|
|
|
2008-01-23 15:43:32 +00:00
|
|
|
/*
 * Declared here, defined elsewhere. Judging by the name and signature, this
 * writes a normalized form of the 16-bit code unit `u16` into `dst`, which has
 * room for `dst_size` bytes.
 * NOTE(review): what the returned pointer designates (e.g. the position after
 * the written bytes, or NULL on failure) is not visible from this header --
 * confirm in the definition.
 */
unsigned char* u16_normalize_tobuffer(uint16_t u16, unsigned char* dst, size_t dst_size);
|
2018-12-03 12:40:13 -05:00
|
|
|
/*
 * Takes a conversion context and an entity name, and returns a read-only
 * string.
 * NOTE(review): presumably maps an HTML entity name to its normalized text,
 * using `conv->entity_buff` as scratch or result storage. The lifetime of the
 * returned pointer and the result for an unknown entity are not visible from
 * this header -- confirm in the definition.
 */
const char* entity_norm(struct entity_conv* conv, const unsigned char* entity);
|
2008-02-03 08:57:05 +00:00
|
|
|
/*
 * Takes a byte buffer `bom` of `length` bytes and returns a read-only string.
 * NOTE(review): presumably returns one of the encoding-name strings defined
 * earlier in this header when a byte-order mark is recognised. The return
 * value when no mark is found (NULL?) is not visible here -- confirm.
 */
const char* encoding_detect_bom(const unsigned char* bom, const size_t length);
|
2008-02-01 19:38:52 +00:00
|
|
|
/*
 * Takes an input area, an initial encoding name and an output area, and
 * returns an int status.
 * NOTE(review): presumably converts the bytes described by `in_m_area` from
 * `initial_encoding` to ASCII and describes the result in `out_m_area`.
 * m_area_t is not declared in this header (likely it comes from "htmlnorm.h").
 * The meaning of the return value and the ownership of any buffer placed in
 * `out_m_area` are not visible here -- confirm in the definition.
 */
int encoding_normalize_toascii(const m_area_t* in_m_area, const char* initial_encoding, m_area_t* out_m_area);
|
2006-12-26 16:17:02 +00:00
|
|
|
|
2020-04-10 07:23:03 -07:00
|
|
|
/**
|
|
|
|
* @brief Convert string to UTF-8, given Windows codepage.
|
|
|
|
*
|
|
|
|
* @param in string buffer
|
|
|
|
* @param in_size length of string buffer in bytes
|
|
|
|
* @param codepage Windows code page (https://docs.microsoft.com/en-us/windows/desktop/Intl/code-page-identifiers)
|
|
|
|
* @param [out] out pointer to receive malloc'ed utf-8 buffer.
|
|
|
|
* @param [out] out_size pointer to receive size of utf-8 buffer, not including null terminating character.
|
|
|
|
* @return cl_error_t CL_SUCCESS on success. CL_BREAK if the conversion could not be done because iconv is unavailable. Another error code on outright failure.
* @note NOTE(review): `in` is declared non-const; whether the input buffer is modified is not visible from this header -- confirm in the definition.
|
|
|
|
*/
|
|
|
|
cl_error_t cli_codepage_to_utf8(char* in, size_t in_size, uint16_t codepage, char** out, size_t* out_size);
|
|
|
|
|
2006-12-26 16:17:02 +00:00
|
|
|
#endif
|