KiCad PCB EDA Suite
Loading...
Searching...
No Matches
utf8.cpp
Go to the documentation of this file.
1/*
2 * This program source code file is part of KiCad, a free EDA CAD application.
3 *
4 * Copyright (C) 2013-2017 SoftPLC Corporation, Dick Hollenbeck <[email protected]>
5 * Copyright The KiCad Developers, see AUTHORS.txt for contributors.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program. If not, see <https://www.gnu.org/licenses/>.
19 */
20
21#include <core/utf8.h>
22#include <wx/strconv.h>
23#include <wx/buffer.h>
24#include <vector>
25
26#include <cassert>
27
28
29/*
30 These are not inlined so that code space is saved by encapsulating the
31 creation of intermediate objects and the referencing of wxConvUTF8.
32*/
33
34
35UTF8::UTF8( const wxString& o ) :
36 m_s( (const char*) o.utf8_str() )
37{
38}
39
40
41wxString UTF8::wx_str() const
42{
43 return wxString( c_str(), wxConvUTF8 );
44}
45
46
47UTF8::operator wxString () const
48{
49 return wxString( c_str(), wxConvUTF8 );
50}
51
52
53UTF8& UTF8::operator=( const wxString& o )
54{
55 m_s = (const char*) o.utf8_str();
56 return *this;
57}
58
59
// There is no wxWidgets function that does this, because wchar_t is 16 bits
// on Windows, so wx encodes its wide-character output as UTF-16 there.
62
63int UTF8::uni_forward( const unsigned char* aSequence, unsigned* aResult )
64{
65 unsigned ch = *aSequence;
66
67 if( ch < 0x80 )
68 {
69 if( aResult )
70 *aResult = ch;
71 return 1;
72 }
73
74 const unsigned char* s = aSequence;
75
76 static const unsigned char utf8_len[] = {
77 // Map encoded prefix byte to sequence length. Zero means
78 // illegal prefix. See RFC 3629 for details
79 /*
80 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00-0F
81 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
82 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
83 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
84 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
85 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
86 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
87 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70-7F
88 */
89 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80-8F
90 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
91 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
92 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0-BF
93 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0-C1 + C2-CF
94 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0-DF
95 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0-EF
96 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F0-F4 + F5-FF
97 };
98
99 int len = utf8_len[ *s - 0x80 /* top half of table is missing */ ];
100
101 switch( len )
102 {
103 default:
104 case 0:
105 if( aResult )
106 wxFAIL_MSG( wxS( "uni_forward: invalid start byte" ) );
107
108 return 0;
109 break;
110
111 case 2:
112 if( ( s[1] & 0xc0 ) != 0x80 )
113 {
114 if( aResult )
115 wxFAIL_MSG( wxS( "uni_forward: invalid continuation byte" ) );
116
117 return 0;
118 }
119
120 ch = ((s[0] & 0x1f) << 6) +
121 ((s[1] & 0x3f) << 0);
122
123 // assert( ch > 0x007F && ch <= 0x07FF );
124 break;
125
126 case 3:
127 if( (s[1] & 0xc0) != 0x80 ||
128 (s[2] & 0xc0) != 0x80 ||
129 (s[0] == 0xE0 && s[1] < 0xA0)
130 // || (s[0] == 0xED && s[1] > 0x9F)
131 )
132 {
133 if( aResult )
134 wxFAIL_MSG( wxS( "uni_forward: invalid continuation byte" ) );
135
136 return 0;
137 }
138
139 ch = ((s[0] & 0x0f) << 12) +
140 ((s[1] & 0x3f) << 6 ) +
141 ((s[2] & 0x3f) << 0 );
142
143 // assert( ch > 0x07FF && ch <= 0xFFFF );
144 break;
145
146 case 4:
147 if( (s[1] & 0xc0) != 0x80 ||
148 (s[2] & 0xc0) != 0x80 ||
149 (s[3] & 0xc0) != 0x80 ||
150 (s[0] == 0xF0 && s[1] < 0x90) ||
151 (s[0] == 0xF4 && s[1] > 0x8F) )
152 {
153 if( aResult )
154 wxFAIL_MSG( wxS( "uni_forward: invalid continuation byte" ) );
155
156 return 0;
157 }
158
159 ch = ((s[0] & 0x7) << 18) +
160 ((s[1] & 0x3f) << 12) +
161 ((s[2] & 0x3f) << 6 ) +
162 ((s[3] & 0x3f) << 0 );
163
164 // assert( ch > 0xFFFF && ch <= 0x10ffff );
165 break;
166 }
167
168 if( aResult )
169 *aResult = ch;
170
171 return len;
172}
173
174
175bool IsUTF8( const char* aString )
176{
177 int len = strlen( aString );
178
179 if( len )
180 {
181 const unsigned char* next = (unsigned char*) aString;
182 const unsigned char* end = next + len;
183
184 while( next < end )
185 {
186 int charLen = UTF8::uni_forward( next, nullptr );
187
188 if( charLen == 0 )
189 return false;
190
191 next += charLen;
192 }
193
194 // uni_forward() should find the exact end if it is truly UTF8
195 if( next > end )
196 return false;
197 }
198
199 return true;
200}
201
202
203UTF8::UTF8( const wchar_t* txt )
204{
205 try
206 {
207 std::vector< char > temp( wcslen( txt ) * 4 + 1 );
208 wxConvUTF8.WC2MB( temp.data(), txt, temp.size() );
209 m_s.assign( temp.data() );
210 }
211 catch(...)
212 {
213 auto string = wxSafeConvertWX2MB( txt );
214 m_s.assign( string );
215 }
216
217 m_s.shrink_to_fit();
218}
219
220
221UTF8& UTF8::operator+=( unsigned w_ch )
222{
223 if( w_ch <= 0x7F )
224 {
225 m_s.operator+=( char( w_ch ) );
226 }
227 else
228 {
229 wchar_t wide_chr[2]; // buffer to store wide chars (UTF16) read from aText
230 wide_chr[1] = 0;
231 wide_chr[0] = w_ch;
232 UTF8 substr( wide_chr );
233 m_s += substr.m_s;
234 }
235
236 return *this;
237}
238
239
240std::ostream& operator<<( std::ostream& aStream, const UTF8& aRhs )
241{
242 aStream << static_cast<const std::string&>( aRhs );
243 return aStream;
244}
An 8 bit string that is assuredly encoded in UTF8, and supplies special conversion support to and fro...
Definition utf8.h:67
UTF8 & operator+=(const UTF8 &str)
Definition utf8.h:129
std::string m_s
Definition utf8.h:325
std::string substr(size_t pos=0, size_t len=npos) const
Definition utf8.h:201
static int uni_forward(const unsigned char *aSequence, unsigned *aResult=nullptr)
Advance over a single UTF8 encoded multibyte character, capturing the Unicode character as it goes,...
Definition utf8.cpp:63
const char * c_str() const
Definition utf8.h:104
wxString wx_str() const
Definition utf8.cpp:41
UTF8(const wxString &o)
Definition utf8.cpp:35
UTF8 & operator=(const wxString &o)
Definition utf8.cpp:53
UTF8()
Definition utf8.h:95
CITER next(CITER it)
Definition ptree.cpp:120
VECTOR2I end
bool IsUTF8(const char *aString)
Test a C string to see if it is UTF8 encoded.
Definition utf8.cpp:175
std::ostream & operator<<(std::ostream &aStream, const UTF8 &aRhs)
Definition utf8.cpp:240