25 #include "nsUniversalDetector.h" 36 nsUniversalDetector(NS_FILTER_ALL),
48 nsUniversalDetector(NS_FILTER_ALL),
60 nsUniversalDetector(NS_FILTER_ALL),
119 size_t size = file.
Length();
132 buffer[size + 0] = 0;
133 buffer[size + 1] = 0;
134 buffer[size + 2] = 0;
135 buffer[size + 3] = 0;
137 size_t readBytes = file.
Read((
void*)buffer, size);
152 if (cfgMgr->
ReadInt(
_T(
"/default_encoding/use_option"), 0) == 1)
160 msg.
Printf(
_T(
"Warning: bypassing C::B's auto-detection!\n" 161 "Encoding requested is: %s (ID: %d)"),
178 msg.
Printf(
_T(
"Detected encoding via BOM: %s (ID: %d)"),
188 Reset(); nsresult res = HandleData((
char*)buffer, size);
209 msg.
Printf(
_T(
"Text seems to be pure ASCII!\n" 210 "We use user specified encoding: %s (ID: %d)"),
224 msg.
Printf(
_T(
"Warning: Using user specified encoding as fallback!\n" 225 "Encoding fallback is: %s (ID: %d)"),
240 msg.
Printf(
_T(
"Final encoding detected: %s (ID: %d)"),
258 if (!buffer || size == 0)
return false;
260 const wxByte* buff_ptr = buffer;
261 const wxByte* buff_end = &buffer[size];
267 if (size >= 4 && memcmp(buffer,
"\xFF\xFE\x00\x00", 4) == 0)
273 else if (size >= 4 && memcmp(buffer,
"\xFE\xFF\x00\x00", 4) == 0)
279 else if (size >= 4 && memcmp(buffer,
"\x00\x00\xFE\xFF", 4) == 0)
285 else if (size >= 4 && memcmp(buffer,
"\x00\x00\xFF\xFE", 4) == 0)
291 else if ( memcmp(buffer,
"\xFF\xFE", 2) == 0)
297 else if ( memcmp(buffer,
"\xFE\xFF", 2) == 0)
303 else if (size >= 3 && memcmp(buffer,
"\xEF\xBB\xBF", 3) == 0)
309 else if (size >= 5 && memcmp(buffer,
"\x2B\x2F\x76\x38\x2D", 5) == 0)
332 unsigned int null_byte_count = 0;
333 unsigned int utf_bytes = 0;
334 unsigned int good_utf_count = 0;
335 unsigned int bad_utf_count = 0;
336 unsigned int bad_utf32_count = 0;
337 unsigned int bad_utf16_count = 0;
338 unsigned int nl_utf32le_count = 0;
339 unsigned int nl_utf32be_count = 0;
340 unsigned int nl_utf16le_count = 0;
341 unsigned int nl_utf16be_count = 0;
343 while (buff_ptr != buff_end)
345 if (*buff_ptr == 0) ++null_byte_count;
350 if ((*buff_ptr & 0xC0) == 0x80 || *buff_ptr == 0)
354 const char c = *buff_ptr;
356 if ((c & 0x80) == 0x00) utf_bytes = 1;
357 else if ((c & 0xE0) == 0xC0) utf_bytes = 2;
358 else if ((c & 0xF0) == 0xE0) utf_bytes = 3;
359 else if ((c & 0xF8) == 0xF0) utf_bytes = 4;
367 else if ((*buff_ptr & 0xC0) == 0x80)
380 if ((
wxUIntPtr)buff_ptr % 4 == 0 && buff_ptr+4 <= buff_end)
382 if (*((
wxUint32*)buff_ptr) == 0 ) ++bad_utf32_count;
388 if ((
wxUIntPtr)buff_ptr % 2 == 0 && buff_ptr+4 <= buff_end)
390 if (*((
wxUint16*)buff_ptr) == 0) ++bad_utf16_count;
403 else if (null_byte_count)
436 if (!buffer || size == 0)
440 logmsg.
Printf(
_T(
"Encoding conversion has failed (buffer is empty)!"));
463 wideBuff = conv.cMB2WC((
const char*)buffer, size + 4 -
m_BOMSizeInBytes, &outlen);
468 wideBuff = conv.cMB2WC((
const char*)buffer, size + 4 -
m_BOMSizeInBytes, &outlen);
472 wxMBConvUTF16BE conv;
473 wideBuff = conv.cMB2WC((
const char*)buffer, size + 4 -
m_BOMSizeInBytes, &outlen);
477 wxMBConvUTF16LE conv;
478 wideBuff = conv.cMB2WC((
const char*)buffer, size + 4 -
m_BOMSizeInBytes, &outlen);
482 wxMBConvUTF32BE conv;
483 wideBuff = conv.cMB2WC((
const char*)buffer, size + 4 -
m_BOMSizeInBytes, &outlen);
487 wxMBConvUTF32LE conv;
488 wideBuff = conv.cMB2WC((
const char*)buffer, size + 4 -
m_BOMSizeInBytes, &outlen);
498 && conv.
Convert((
const char*)buffer, tmp) )
504 logmsg.
Printf(
_T(
"Conversion succeeded using wxEncodingConverter " 505 "(buffer size = %lu, converted size = %lu."), static_cast<unsigned long>(size), static_cast<unsigned long>(outlen));
515 wideBuff = csconv.cMB2WC((
const char*)buffer, size + 4 -
m_BOMSizeInBytes, &outlen);
518 logmsg.
Printf(
_T(
"Conversion succeeded using wxCSConv " 519 "(buffer size = %lu, converted size = %lu."), static_cast<unsigned long>(size), static_cast<unsigned long>(outlen));
536 logmsg.
Printf(
_T(
"Encoding conversion using settings has failed!\n" 537 "Encoding chosen was: %s (ID: %d)"),
545 if (cfgMgr->
ReadBool(
_T(
"/default_encoding/use_system"),
true))
547 if (platform::windows)
550 logmgr->
DebugLog(
_T(
"Trying system locale as fallback..."));
558 logmgr->
DebugLog(
_T(
"Trying ISO-8859-1 as fallback..."));
564 wideBuff = conv_system.cMB2WC((
const char*)buffer, size + 4 -
m_BOMSizeInBytes, &outlen);
571 logmsg.
Printf(
_T(
"Encoding conversion using system locale fallback has failed!\n" 572 "Last encoding choosen was: %s (ID: %d)\n" 573 "Don't know what to do."),
585 logmgr->
DebugLog(
_T(
"Encoding conversion has seriously failed!\n" 586 "Don't know what to do."));
static wxFontEncoding GetEncodingFromName(const wxString &encoding)
wxString F(const wxChar *msg,...)
sprintf-like function
#define wxUINT32_SWAP_ON_BE(wxUint32_value)
wxFontEncoding m_Encoding
ConfigManager * GetConfigManager(const wxString &name_space) const
int ReadInt(const wxString &name, int defaultVal=0)
bool DetectEncoding(const wxString &filename, bool convert_to_wxstring=true)
static Manager * Get()
Use Manager::Get() to get a pointer to its instance Manager::Get() is guaranteed to never return an i...
#define wxUINT16_SWAP_ON_LE(wxUint16_value)
void resize(size_t nSize, wxUniChar ch='\0')
wxFileOffset Length() const
bool ReadBool(const wxString &name, bool defaultVal=false)
bool DetectEncodingEx(const wxByte *buffer, size_t len)
void Report(const char *aCharset) override
#define wxUINT32_SWAP_ON_LE(wxUint32_value)
wxString makeStringNoNull(const wxWCharBuffer &wideBuff)
Convert the char buffer to wxString and if there are any null-terminating characters at the end - rem...
bool Contains(const wxString &str) const
DLLIMPORT wxString cbC2U(const char *str)
Return str as a proper unicode-compatible string.
static wxString GetSystemEncodingName()
LogManager * GetLogManager() const
wxString Read(const wxString &key, const wxString &defaultVal=wxEmptyString)
#define wxUINT16_SWAP_ON_BE(wxUint16_value)
wxString GetWxStr() const
size_t find_last_not_of(const wxString &str, size_t nStart=npos) const
const wxStringCharType * wx_str() const
static wxFontEncoding GetSystemEncoding()
ssize_t Read(void *buffer, size_t count)
EncodingDetector(const wxString &filename, bool useLog=true)
bool ConvertToWxString(const wxByte *buffer, size_t size)
int GetBOMSizeInBytes() const
void DebugLog(const wxString &msg, Logger::level lv=Logger::info)
bool Init(wxFontEncoding input_enc, wxFontEncoding output_enc, int method=wxCONVERT_STRICT)
wxFontEncoding GetFontEncoding() const
bool Convert(const char *input, char *output) const
virtual wxFontEncoding CharsetToEncoding(const wxString &charset, bool interactive=true)
static wxFontMapper * Get()
int Printf(const wxString &pszFormat,...)
~EncodingDetector() override