#include "drw_textcodec.h"
#include <sstream>
#include <iomanip>
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <string>
#include "../drw_base.h"
#include "drw_cptables.h"
#include "drw_cptable932.h"
#include "drw_cptable936.h"
#include "drw_cptable949.h"
#include "drw_cptable950.h"

/**
 * Creates a codec preset to DRW::AC1021 with a pass-through converter
 * (a DRW_Converter built without any translation table).
 */
DRW_TextCodec::DRW_TextCodec()
{
  version = DRW::AC1021;
  conv = new DRW_Converter( nullptr, 0 );
}

/** Releases the converter owned by this codec. */
DRW_TextCodec::~DRW_TextCodec()
{
  delete conv;
}

/**
 * Selects the version family and installs the matching default code page.
 *
 * \param v          one of the DRW version constants
 * \param dxfFormat  forwarded to setCodePage(); for the newest family it
 *                   also chooses between "UTF-8" (true) and "UTF-16" (false)
 *
 * AC1006/AC1009 are stored as AC1009, AC1012..AC1018 are stored as AC1015,
 * anything else is stored as AC1021.
 */
void DRW_TextCodec::setVersion( int v, bool dxfFormat )
{
  if ( v == DRW::AC1009 || v == DRW::AC1006 )
  {
    version = DRW::AC1009;
    cp = "ANSI_1252";
    setCodePage( &cp, dxfFormat );
  }
  else if ( v == DRW::AC1012 || v == DRW::AC1014
            || v == DRW::AC1015 || v == DRW::AC1018 )
  {
    version = DRW::AC1015;
    // The code page is always reset here (a former "only if cp is empty"
    // guard was disabled in the original code).
    cp = "ANSI_1252";
    setCodePage( &cp, dxfFormat );
  }
  else
  {
    version = DRW::AC1021;
    if ( dxfFormat )
      cp = "UTF-8";  //RLZ: can be UCS2 or UTF-16 16bits per char
    else
      cp = "UTF-16"; //RLZ: can be UCS2 or UTF-16 16bits per char
    setCodePage( &cp, dxfFormat );
  }
}

/**
 * Same as setVersion( int, bool ) but takes the textual version tag.
 *
 * \param v          pointer to a tag such as "AC1009" or "AC1015"
 * \param dxfFormat  forwarded to setVersion( int, bool )
 *
 * Unknown tags select DRW::AC1021. The AC1021 call is the final \c else arm:
 * when it ran unconditionally it overwrote the result of the two branches
 * above it, so every input ended up configured as AC1021.
 */
void DRW_TextCodec::setVersion( std::string *v, bool dxfFormat )
{
  const std::string &versionStr = *v;
  if ( versionStr == "AC1009" || versionStr == "AC1006" )
  {
    setVersion( DRW::AC1009, dxfFormat );
  }
  else if ( versionStr == "AC1012" || versionStr == "AC1014"
            || versionStr == "AC1015" || versionStr == "AC1018" )
  {
    setVersion( DRW::AC1015, dxfFormat );
  }
  else
  {
    setVersion( DRW::AC1021, dxfFormat );
  }
}

/**
 * Normalizes the code page name and replaces the active converter.
 *
 * \param c          pointer to the requested code page name (any alias
 *                   understood by correctCodePage())
 * \param dxfFormat  only consulted when the stored version is neither AC1009
 *                   nor AC1015: true installs a pass-through converter,
 *                   false installs the UTF-16 converter
 */
void DRW_TextCodec::setCodePage( std::string *c, bool dxfFormat )
{
  cp = correctCodePage( *c );

  delete conv;
  conv = nullptr; // never leave a dangling pointer if an allocation below throws

  if ( version == DRW::AC1009 || version == DRW::AC1015 )
  {
    if ( cp == "ANSI_874" )
      conv = new DRW_ConvTable( DRW_Table874, CPLENGTHCOMMON );
    else if ( cp == "ANSI_932" )
      conv = new DRW_Conv932Table( DRW_Table932, DRW_LeadTable932,
                                   DRW_DoubleTable932, CPLENGTH932 );
    else if ( cp == "ANSI_936" )
      conv = new DRW_ConvDBCSTable( DRW_Table936, DRW_LeadTable936,
                                    DRW_DoubleTable936, CPLENGTH936 );
    else if ( cp == "ANSI_949" )
      conv = new DRW_ConvDBCSTable( DRW_Table949, DRW_LeadTable949,
                                    DRW_DoubleTable949, CPLENGTH949 );
    else if ( cp == "ANSI_950" )
      conv = new DRW_ConvDBCSTable( DRW_Table950, DRW_LeadTable950,
                                    DRW_DoubleTable950, CPLENGTH950 );
    else if ( cp == "ANSI_1250" )
      conv = new DRW_ConvTable( DRW_Table1250, CPLENGTHCOMMON );
    else if ( cp == "ANSI_1251" )
      conv = new DRW_ConvTable( DRW_Table1251, CPLENGTHCOMMON );
    else if ( cp == "ANSI_1253" )
      conv = new DRW_ConvTable( DRW_Table1253, CPLENGTHCOMMON );
    else if ( cp == "ANSI_1254" )
      conv = new DRW_ConvTable( DRW_Table1254, CPLENGTHCOMMON );
    else if ( cp == "ANSI_1255" )
      conv = new DRW_ConvTable( DRW_Table1255, CPLENGTHCOMMON );
    else if ( cp == "ANSI_1256" )
      conv = new DRW_ConvTable( DRW_Table1256, CPLENGTHCOMMON );
    else if ( cp == "ANSI_1257" )
      conv = new DRW_ConvTable( DRW_Table1257, CPLENGTHCOMMON );
    else if ( cp == "ANSI_1258" )
      conv = new DRW_ConvTable( DRW_Table1258, CPLENGTHCOMMON );
    else if ( cp == "UTF-8" ) //DXF older than 2007 are written in win codepages
    {
      cp = "ANSI_1252";
      conv = new DRW_Converter( nullptr, 0 );
    }
    else
      conv = new DRW_ConvTable( DRW_Table1252, CPLENGTHCOMMON );
  }
  else
  {
    if ( dxfFormat )
      conv = new DRW_Converter( nullptr, 0 ); //text is handled as utf8 already
    else
      conv = new DRW_ConvUTF16();             //utf16 to utf8
  }
}

/** Converts \a s to UTF-8 using the active converter. */
std::string DRW_TextCodec::toUtf8( std::string s )
{
  return conv->toUtf8( &s );
}

/** Converts the UTF-8 string \a s using the active converter. */
std::string DRW_TextCodec::fromUtf8( std::string s )
{
  return conv->fromUtf8( &s );
}

/**
 * Base conversion: the input bytes are copied unchanged, except that every
 * 7-character "\U+XXXX" escape is replaced by the output of encodeText().
 * Bytes >= 0x80 are skipped over according to their value (1, 2 or 3 extra
 * bytes) so an escape is only recognised at an ASCII position.
 */
std::string DRW_Converter::toUtf8( std::string *s )
{
  std::string result;
  const std::string::size_type len = s->length();
  std::string::size_type copied = 0; // first byte not yet appended to result

  for ( std::string::size_type i = 0; i < len; i++ )
  {
    const unsigned char c = s->at( i );
    if ( c < 0x80 ) //ascii check for \U+????
    {
      if ( c == '\\' && i + 6 < len && s->at( i + 1 ) == 'U' && s->at( i + 2 ) == '+' )
      {
        result += s->substr( copied, i - copied );
        result += encodeText( s->substr( i, 7 ) );
        i += 6;
        copied = i + 1;
      }
    }
    else if ( c < 0xE0 ) //2 bytes
    {
      i++;
    }
    else if ( c < 0xF0 ) //3 bytes
    {
      i += 2;
    }
    else if ( c < 0xF8 ) //4 bytes
    {
      i += 3;
    }
  }

  result += s->substr( copied );
  return result;
}

/**
 * Converts UTF-8 to the single-byte code page described by \c table.
 * ASCII is copied; every other code point is looked up in the table and
 * written as byte CPOFFSET + index, or as a "\U+XXXX" escape when the table
 * does not contain it.
 */
std::string DRW_ConvTable::fromUtf8( std::string *s )
{
  std::string result;
  std::string::size_type copied = 0; // first byte not yet appended to result

  for ( std::string::size_type i = 0; i < s->length(); i++ )
  {
    const unsigned char c = s->at( i );
    if ( c <= 0x7F )
      continue;

    //need to decode
    result += s->substr( copied, i - copied );
    int seqLen = 1;
    const int code = decodeNum( s->substr( i, 4 ), &seqLen );
    copied = i + seqLen;
    i = copied - 1;

    bool notFound = true;
    for ( int k = 0; k < cpLength; k++ )
    {
      if ( table[k] == code )
      {
        result += static_cast<char>( CPOFFSET + k ); //translate from table
        notFound = false;
        break;
      }
    }
    if ( notFound )
      result += decodeText( code );
  }

  result += s->substr( copied );
  return result;
}

/**
 * Converts single-byte code page text to UTF-8. ASCII is copied, "\U+XXXX"
 * escapes are expanded through encodeText(), and bytes >= 0x80 are mapped
 * through \c table (indexed by byte - 0x80).
 */
std::string DRW_ConvTable::toUtf8( std::string *s )
{
  std::string res;
  for ( std::string::iterator it = s->begin(); it < s->end(); ++it )
  {
    const unsigned char c = *it;
    if ( c >= 0x80 )
    {
      res += encodeNum( table[c - 0x80] ); //translate from table
    }
    // distance check instead of "it + 6 < end": never forms an iterator past end()
    else if ( c == '\\' && s->end() - it > 6 && *( it + 1 ) == 'U' && *( it + 2 ) == '+' )
    {
      res += encodeText( std::string( it, it + 7 ) );
      it += 6;
    }
    else
    {
      res += static_cast<char>( c ); //plain ascii, or '\' not starting an escape
    }
  }
  return res;
}

/**
 * Turns a "\U+XXXX" escape into UTF-8.
 *
 * \param stmp  the escape; characters 3..6 are parsed as a hexadecimal number
 * \returns encodeNum() of the parsed value (0 when nothing could be parsed)
 */
std::string DRW_Converter::encodeText( std::string stmp )
{
  int code = 0;
#if defined(__APPLE__)
  unsigned int parsed = 0; // "%x" requires an unsigned int target
  const int succeeded = sscanf( stmp.substr( 3, 4 ).c_str(), "%x", &parsed );
  if ( succeeded == 1 )
    code = static_cast<int>( parsed );
#else
  std::istringstream sd( stmp.substr( 3, 4 ) );
  sd >> std::hex >> code;
#endif
  return encodeNum( code );
}

/**
 * Builds the "\U+XXXX" escape for code point \a c: uppercase hexadecimal,
 * zero padded to at least four digits.
 */
std::string DRW_Converter::decodeText( int c )
{
  std::string res = "\\U+";
  std::string num;
#if defined(__APPLE__)
  // Format into a plain buffer: assigning a pre-sized std::string would also
  // append its unused NUL padding to the result.
  char buf[16];
  snprintf( buf, sizeof( buf ), "%04X", static_cast<unsigned int>( c ) );
  num = buf;
#else
  std::stringstream ss;
  ss << std::uppercase << std::setfill( '0' ) << std::setw( 4 ) << std::hex << c;
  ss >> num;
#endif
  res += num;
  return res;
}

/**
 * Encodes code point \a c as a UTF-8 byte sequence of 1 to 4 bytes.
 * The result is built from a NUL-terminated buffer, so c == 0 yields an
 * empty string.
 */
std::string DRW_Converter::encodeNum( int c )
{
  unsigned char ret[5];
  if ( c < 128 ) // 0-7F US-ASCII 7 bits
  {
    ret[0] = c;
    ret[1] = 0;
  }
  else if ( c < 0x800 ) //80-07FF 2 bytes
  {
    ret[0] = 0xC0 | ( c >> 6 );
    ret[1] = 0x80 | ( c & 0x3f );
    ret[2] = 0;
  }
  else if ( c < 0x10000 ) //800-FFFF 3 bytes
  {
    ret[0] = 0xe0 | ( c >> 12 );
    ret[1] = 0x80 | ( ( c >> 6 ) & 0x3f );
    ret[2] = 0x80 | ( c & 0x3f );
    ret[3] = 0;
  }
  else //10000-10FFFF 4 bytes
  {
    ret[0] = 0xf0 | ( c >> 18 );
    ret[1] = 0x80 | ( ( c >> 12 ) & 0x3f );
    ret[2] = 0x80 | ( ( c >> 6 ) & 0x3f );
    ret[3] = 0x80 | ( c & 0x3f );
    ret[4] = 0;
  }
  return std::string( reinterpret_cast<char *>( ret ) );
}

/**
 * Decodes the UTF-8 sequence at the start of \a s.
 *
 * \param s  bytes starting at the lead byte (callers pass up to 4 bytes)
 * \param b  receives the number of bytes consumed: 2, 3 or 4 for a decoded
 *           sequence, 1 when the lead byte is not a 2/3/4-byte lead or the
 *           sequence is cut short by the end of \a s
 * \returns the decoded code point, or 0 when nothing was decoded
 *
 * \a b is always written, so callers never advance by an uninitialised
 * amount, and a truncated sequence no longer throws std::out_of_range.
 */
int DRW_Converter::decodeNum( std::string s, int *b )
{
  int code = 0;
  *b = 1;
  if ( s.empty() )
    return code;

  const std::string::size_type avail = s.length();
  const unsigned char c = s.at( 0 );
  if ( ( c & 0xE0 ) == 0xC0 && avail >= 2 ) //2 bytes
  {
    code = ( c & 0x1F ) << 6;
    code = ( s.at( 1 ) & 0x3F ) | code;
    *b = 2;
  }
  else if ( ( c & 0xF0 ) == 0xE0 && avail >= 3 ) //3 bytes
  {
    code = ( c & 0x0F ) << 12;
    code = ( ( s.at( 1 ) & 0x3F ) << 6 ) | code;
    code = ( s.at( 2 ) & 0x3F ) | code;
    *b = 3;
  }
  else if ( ( c & 0xF8 ) == 0xF0 && avail >= 4 ) //4 bytes
  {
    code = ( c & 0x07 ) << 18;
    code = ( ( s.at( 1 ) & 0x3F ) << 12 ) | code;
    code = ( ( s.at( 2 ) & 0x3F ) << 6 ) | code;
    code = ( s.at( 3 ) & 0x3F ) | code;
    *b = 4;
  }

  return code;
}

/**
 * Converts UTF-8 to a double-byte code page. ASCII is copied; every other
 * code point is searched linearly in \c doubleTable (column 1) and written
 * as the two bytes of column 0, or as a "\U+XXXX" escape when absent.
 */
std::string DRW_ConvDBCSTable::fromUtf8( std::string *s )
{
  std::string result;
  std::string::size_type copied = 0; // first byte not yet appended to result

  for ( std::string::size_type i = 0; i < s->length(); i++ )
  {
    const unsigned char c = s->at( i );
    if ( c <= 0x7F )
      continue; //direct conversion

    //need to decode
    result += s->substr( copied, i - copied );
    int seqLen = 1;
    const int code = decodeNum( s->substr( i, 4 ), &seqLen );
    copied = i + seqLen;
    i = copied - 1;

    bool notFound = true;
    for ( int k = 0; k < cpLength; k++ )
    {
      if ( doubleTable[k][1] == code )
      {
        const int data = doubleTable[k][0];
        char d[3];
        d[0] = data >> 8;
        d[1] = data & 0xFF;
        d[2] = '\0';
        result += d; //translate from table
        notFound = false;
        break;
      }
    }
    if ( notFound )
      result += decodeText( code );
  }

  result += s->substr( copied );
  return result;
}

/**
 * Converts double-byte code page text to UTF-8. ASCII is copied ("\U+XXXX"
 * escapes are expanded), byte 0x80 becomes U+20AC, and any other byte is
 * combined with the following byte and looked up in the slice of
 * \c doubleTable delimited by \c leadTable. Unknown or truncated pairs
 * produce NOTFOUND936.
 */
std::string DRW_ConvDBCSTable::toUtf8( std::string *s )
{
  std::string res;
  for ( std::string::iterator it = s->begin(); it < s->end(); ++it )
  {
    bool notFound = true;
    const unsigned char c = *it;
    if ( c < 0x80 )
    {
      notFound = false;
      //check for \U+ encoded text
      if ( c == '\\' && s->end() - it > 6 && *( it + 1 ) == 'U' && *( it + 2 ) == '+' )
      {
        res += encodeText( std::string( it, it + 7 ) );
        it += 6;
      }
      else
      {
        res += static_cast<char>( c ); //plain ascii, or '\' not starting an escape
      }
    }
    else if ( c == 0x80 ) //1 byte table
    {
      notFound = false;
      res += encodeNum( 0x20AC ); //euro sign
    }
    else if ( s->end() - it >= 2 ) //2 bytes
    {
      ++it;
      const int code = ( c << 8 ) | static_cast<unsigned char>( *it );
      const int sta = leadTable[c - 0x81];
      const int end = leadTable[c - 0x80];
      for ( int k = sta; k < end; k++ )
      {
        if ( doubleTable[k][0] == code )
        {
          res += encodeNum( doubleTable[k][1] ); //translate from table
          notFound = false;
          break;
        }
      }
    }
    // else: lead byte is the last byte of the input; there is no trail byte
    // to read, so it is reported as not found without touching end().

    if ( notFound )
      res += encodeNum( NOTFOUND936 );
  }
  return res;
}

/**
 * Converts UTF-8 to code page 932. Code points strictly between 0xFF60 and
 * 0xFFA0 map to a single byte (code - CPOFFSET932); code points inside the
 * ranges tested below are searched in \c doubleTable; everything else that
 * is not ASCII becomes a "\U+XXXX" escape.
 */
std::string DRW_Conv932Table::fromUtf8( std::string *s )
{
  std::string result;
  std::string::size_type copied = 0; // first byte not yet appended to result

  for ( std::string::size_type i = 0; i < s->length(); i++ )
  {
    const unsigned char c = s->at( i );
    if ( c <= 0x7F )
      continue; //direct conversion

    //need to decode
    result += s->substr( copied, i - copied );
    int seqLen = 1;
    const int code = decodeNum( s->substr( i, 4 ), &seqLen );
    copied = i + seqLen;
    i = copied - 1;

    bool notFound = true;
    // 1 byte table
    if ( code > 0xff60 && code < 0xFFA0 )
    {
      result += static_cast<char>( code - CPOFFSET932 ); //translate from table
      notFound = false;
    }
    // only scan the table for ranges it can contain
    if ( notFound && ( code < 0xF8 || ( code > 0x390 && code < 0x542 ) ||
                       ( code > 0x200F && code < 0x9FA1 ) || code > 0xF928 ) )
    {
      for ( int k = 0; k < cpLength; k++ )
      {
        if ( doubleTable[k][1] == code )
        {
          const int data = doubleTable[k][0];
          char d[3];
          d[0] = data >> 8;
          d[1] = data & 0xFF;
          d[2] = '\0';
          result += d; //translate from table
          notFound = false;
          break;
        }
      }
    }
    if ( notFound )
      result += decodeText( code );
  }

  result += s->substr( copied );
  return result;
}

/**
 * Converts code page 932 text to UTF-8. ASCII is copied ("\U+XXXX" escapes
 * are expanded), bytes strictly between 0xA0 and 0xE0 map to
 * byte + CPOFFSET932, and any other byte is combined with the following
 * byte and looked up in DRW_DoubleTable932 through DRW_LeadTable932.
 * Unknown or truncated pairs produce NOTFOUND932.
 */
std::string DRW_Conv932Table::toUtf8( std::string *s )
{
  std::string res;
  for ( std::string::iterator it = s->begin(); it < s->end(); ++it )
  {
    bool notFound = true;
    const unsigned char c = *it;
    if ( c < 0x80 )
    {
      notFound = false;
      //check for \U+ encoded text
      if ( c == '\\' && s->end() - it > 6 && *( it + 1 ) == 'U' && *( it + 2 ) == '+' )
      {
        res += encodeText( std::string( it, it + 7 ) );
        it += 6;
      }
      else
      {
        res += static_cast<char>( c ); //plain ascii, or '\' not starting an escape
      }
    }
    else if ( c > 0xA0 && c < 0xE0 ) //1 byte table
    {
      notFound = false;
      res += encodeNum( c + CPOFFSET932 ); //translate from table
    }
    else if ( s->end() - it >= 2 ) //2 bytes
    {
      ++it;
      const int code = ( c << 8 ) | static_cast<unsigned char>( *it );
      int sta = 0;
      int end = 0;
      if ( c > 0x80 && c < 0xA0 )
      {
        sta = DRW_LeadTable932[c - 0x81];
        end = DRW_LeadTable932[c - 0x80];
      }
      else if ( c > 0xDF && c < 0xFD )
      {
        sta = DRW_LeadTable932[c - 0xC1];
        end = DRW_LeadTable932[c - 0xC0];
      }
      if ( end > 0 )
      {
        for ( int k = sta; k < end; k++ )
        {
          if ( DRW_DoubleTable932[k][0] == code )
          {
            res += encodeNum( DRW_DoubleTable932[k][1] ); //translate from table
            notFound = false;
            break;
          }
        }
      }
    }
    // else: lead byte is the last byte of the input; there is no trail byte
    // to read, so it is reported as not found without touching end().

    if ( notFound )
      res += encodeNum( NOTFOUND932 );
  }
  return res;
}

/** Not implemented: always returns an empty string. */
std::string DRW_ConvUTF16::fromUtf8( std::string *s )
{
  DRW_UNUSED( s );
  //RLZ: to be written (only needed for write dwg 2007+)
  return std::string();
}

/**
 * Converts little-endian 16-bit units to UTF-8, one unit at a time
 * (low byte first). A trailing unpaired byte is treated as a unit whose high
 * byte is 0 instead of reading past the end of the string.
 * NOTE(review): each unit is encoded on its own, so surrogate pairs are not
 * combined into a single code point.
 */
std::string DRW_ConvUTF16::toUtf8( std::string *s ) //RLZ: pending to write
{
  std::string res;
  const std::string::size_type len = s->length();
  for ( std::string::size_type i = 0; i < len; i += 2 )
  {
    const unsigned char c1 = ( *s )[i];
    const unsigned char c2 = ( i + 1 < len ) ? ( *s )[i + 1] : 0;
    const duint16 ch = ( c2 << 8 ) | c1;
    res += encodeNum( ch );
  }
  return res;
}

/**
 * Maps a code page name or alias to the canonical name used by
 * setCodePage().
 *
 * \param s  code page name; compared case-insensitively
 * \returns one of "ANSI_874", "ANSI_932", "ANSI_936", "ANSI_949",
 *          "ANSI_950", "ANSI_1250".."ANSI_1258", "UTF-8", "UTF-16";
 *          unknown names yield "ANSI_1252"
 */
std::string DRW_TextCodec::correctCodePage( const std::string &s )
{
  //stringstream cause crash in OS/X, bug#3597944
  std::string name = s;
  // toupper must receive a value representable as unsigned char
  std::transform( name.begin(), name.end(), name.begin(),
                  []( unsigned char ch ) { return static_cast<char>( std::toupper( ch ) ); } );

  // Every literal below is upper case because the name was upper-cased above.

  //Latin/Thai
  if ( name == "ANSI_874" || name == "CP874" || name == "ISO8859-11" || name == "TIS-620" )
  {
    return "ANSI_874";
  }
  //Central Europe and Eastern Europe
  else if ( name == "ANSI_1250" || name == "CP1250" || name == "ISO8859-2" )
  {
    return "ANSI_1250";
  }
  //Cyrillic script
  else if ( name == "ANSI_1251" || name == "CP1251" || name == "ISO8859-5" || name == "KOI8-R" ||
            name == "KOI8-U" || name == "IBM 866" )
  {
    return "ANSI_1251";
  }
  //Western Europe
  else if ( name == "ANSI_1252" || name == "CP1252" || name == "LATIN1" || name == "ISO-8859-1" ||
            name == "CP819" || name == "CSISO" || name == "IBM819" || name == "ISO_8859-1" ||
            name == "APPLE ROMAN" || name == "ISO8859-1" || name == "ISO8859-15" ||
            name == "ISO-IR-100" || name == "L1" || name == "IBM 850" )
  {
    return "ANSI_1252";
  }
  //Greek
  else if ( name == "ANSI_1253" || name == "CP1253" || name == "ISO8859-7" )
  {
    return "ANSI_1253";
  }
  //Turkish
  else if ( name == "ANSI_1254" || name == "CP1254" || name == "ISO8859-9" || name == "ISO8859-3" )
  {
    return "ANSI_1254";
  }
  //Hebrew
  else if ( name == "ANSI_1255" || name == "CP1255" || name == "ISO8859-8" )
  {
    return "ANSI_1255";
  }
  //Arabic
  else if ( name == "ANSI_1256" || name == "CP1256" || name == "ISO8859-6" )
  {
    return "ANSI_1256";
  }
  //Baltic
  else if ( name == "ANSI_1257" || name == "CP1257" || name == "ISO8859-4" ||
            name == "ISO8859-10" || name == "ISO8859-13" )
  {
    return "ANSI_1257";
  }
  //Vietnamese
  else if ( name == "ANSI_1258" || name == "CP1258" )
  {
    return "ANSI_1258";
  }
  //Japanese
  else if ( name == "ANSI_932" || name == "SHIFT-JIS" || name == "SHIFT_JIS" ||
            name == "CSSHIFTJIS" || name == "CSWINDOWS31J" || name == "MS_KANJI" ||
            name == "X-MS-CP932" || name == "X-SJIS" || name == "EUCJP" || name == "EUC-JP" ||
            name == "CSEUCPKDFMTJAPANESE" || name == "X-EUC" || name == "X-EUC-JP" ||
            name == "JIS7" )
  {
    return "ANSI_932";
  }
  //Chinese PRC GBK (XGB) simplified
  else if ( name == "ANSI_936" || name == "GBK" || name == "GB2312" || name == "CHINESE" ||
            name == "CN-GB" || name == "CSGB2312" || name == "CSGB231280" ||
            name == "CSISO58BG231280" || name == "GB_2312-80" || name == "GB231280" ||
            name == "GB2312-80" || name == "ISO-IR-58" || name == "GB18030" )
  {
    return "ANSI_936";
  }
  //Korean
  else if ( name == "ANSI_949" || name == "EUCKR" )
  {
    return "ANSI_949";
  }
  //Chinese Big5 (Taiwan, Hong Kong SAR)
  else if ( name == "ANSI_950" || name == "BIG5" || name == "CN-BIG5" || name == "CSBIG5" ||
            name == "X-X-BIG5" || name == "BIG5-HKSCS" )
  {
    return "ANSI_950";
  }
  //celtic
  /*    } else if (cp=="ISO8859-14") {
         return "ISO8859-14";
      } else if (cp=="TSCII") {
          return "TSCII"; //tamil
      }*/
  else if ( name == "UTF-8" || name == "UTF8" || name == "UTF8-BIT" )
  {
    return "UTF-8";
  }
  else if ( name == "UTF-16" || name == "UTF16" || name == "UTF16-BIT" )
  {
    return "UTF-16";
  }

  return "ANSI_1252";
}