KiCad PCB EDA Suite
UTF8 Class Reference

An 8 bit string that is assuredly encoded in UTF8, and supplies special conversion support to and from wxString, to and from std::string, and has non-mutating iteration over Unicode characters. More...

#include <utf8.h>

Classes

class  uni_iter
 uni_iter is a non-mutating iterator that walks through unicode code points in the UTF8 encoded string. More...
 

Public Member Functions

 UTF8 (const wxString &o)
 
 UTF8 (const char *txt)
 This is a constructor for which you could end up with non-UTF8 encoding, but that would be your fault. More...
 
 UTF8 (const wchar_t *txt)
 For use with _() function on wx 2.8. More...
 
 UTF8 (const std::string &o)
 
 UTF8 ()
 
 ~UTF8 ()
 
const char * c_str () const
 
bool empty () const
 
std::string::size_type find (char c) const
 
std::string::size_type find (char c, size_t s) const
 
void clear ()
 
std::string::size_type length () const
 
std::string::size_type size () const
 
int compare (const std::string &s) const
 
bool operator== (const UTF8 &rhs) const
 
bool operator== (const std::string &rhs) const
 
bool operator== (const char *s) const
 
std::string::size_type find_first_of (const std::string &str, std::string::size_type pos=0) const
 
UTF8 & operator+= (const UTF8 &str)
 
UTF8 & operator+= (char ch)
 
UTF8 & operator+= (const char *s)
 
UTF8 & operator+= (unsigned w_ch)
 Append a wide (unicode) char to the UTF8 string. More...
 
UTF8 & operator= (const wxString &o)
 
UTF8 & operator= (const std::string &o)
 
UTF8 & operator= (const char *s)
 
UTF8 & operator= (char c)
 
std::string substr (size_t pos=0, size_t len=npos) const
 
 operator const std::string & () const
 
wxString wx_str () const
 
 operator wxString () const
 
std::string::const_iterator begin () const
 
std::string::const_iterator end () const
 
uni_iter ubegin () const
 Returns a uni_iter initialized to the start of "this" UTF8 byte sequence. More...
 
uni_iter uend () const
 Return a uni_iter initialized to the end of "this" UTF8 byte sequence. More...
 

Static Public Member Functions

static int uni_forward (const unsigned char *aSequence, unsigned *aResult=nullptr)
 Advance over a single UTF8 encoded multibyte character, capturing the Unicode character as it goes, and returning the number of bytes consumed. More...
 

Static Public Attributes

static constexpr std::string::size_type npos = -1
 

Protected Attributes

std::string m_s
 

Detailed Description

An 8 bit string that is assuredly encoded in UTF8, and supplies special conversion support to and from wxString, to and from std::string, and has non-mutating iteration over Unicode characters.

I've been careful to supply only conversion facilities and not try and duplicate wxString() with many member functions. There are multiple ways to create text into a std::string without the need of too many member functions:

- richio.h's StrPrintf()
- std::ostringstream

Because this class uses no virtuals, it should be possible to cast any std::string into a UTF8 using this kind of cast: (UTF8 &) without construction or copying being the effect of the cast. Be sure the source std::string holds UTF8 encoded text before you do that.

Definition at line 70 of file utf8.h.

Constructor & Destructor Documentation

◆ UTF8() [1/5]

UTF8::UTF8 ( const wxString &  o)

Definition at line 40 of file utf8.cpp.

40 :
41 m_s( (const char*) o.utf8_str() )
42{
43}
std::string m_s
Definition: utf8.h:304

◆ UTF8() [2/5]

UTF8::UTF8 ( const char *  txt)
inline

This is a constructor for which you could end up with non-UTF8 encoding, but that would be your fault.

Definition at line 77 of file utf8.h.

77 :
78 m_s( txt )
79 {
81 }
const char * c_str() const
Definition: utf8.h:102
#define MAYBE_VERIFY_UTF8(x)
Definition: utf8.h:49

References c_str(), and MAYBE_VERIFY_UTF8.

◆ UTF8() [3/5]

UTF8::UTF8 ( const wchar_t *  txt)

For use with _() function on wx 2.8.

BTW _() on wx >= 2.9 returns wxString, not wchar_t* like on 2.8.

Definition at line 208 of file utf8.cpp.

209{
210 try
211 {
212 std::vector< char > temp( wcslen( txt ) * 4 + 1 );
213 wxConvUTF8.WC2MB( temp.data(), txt, temp.size() );
214 m_s.assign( temp.data() );
215 }
216 catch(...)
217 {
218 auto string = wxSafeConvertWX2MB( txt );
219 m_s.assign( string );
220 }
221
222 m_s.shrink_to_fit();
223}

References m_s.

◆ UTF8() [4/5]

UTF8::UTF8 ( const std::string &  o)
inline

Definition at line 87 of file utf8.h.

87 :
88 m_s( o )
89 {
91 }

References c_str(), and MAYBE_VERIFY_UTF8.

◆ UTF8() [5/5]

UTF8::UTF8 ( )
inline

Definition at line 93 of file utf8.h.

94 {
95 }

◆ ~UTF8()

UTF8::~UTF8 ( )
inline

Definition at line 97 of file utf8.h.

98 {
99 }

Member Function Documentation

◆ begin()

std::string::const_iterator UTF8::begin ( ) const
inline

Definition at line 192 of file utf8.h.

192{ return m_s.begin(); }

References m_s.

Referenced by LIB_TABLE::FormatOptions().

◆ c_str()

◆ clear()

void UTF8::clear ( )
inline

Definition at line 108 of file utf8.h.

108{ m_s.clear(); }

References m_s.

Referenced by LIB_ID::clear(), and RESCUE_CASE_CANDIDATE::FindRescues().

◆ compare()

int UTF8::compare ( const std::string &  s) const
inline

Definition at line 111 of file utf8.h.

111{ return m_s.compare( s ); }

References m_s.

Referenced by LIB_ID::compare(), and SCH_REFERENCE::CompareLibName().

◆ empty()

◆ end()

std::string::const_iterator UTF8::end ( ) const
inline

Definition at line 193 of file utf8.h.

193{ return m_s.end(); }

References m_s.

Referenced by LIB_TABLE::FormatOptions().

◆ find() [1/2]

std::string::size_type UTF8::find ( char  c) const
inline

Definition at line 105 of file utf8.h.

105{ return m_s.find( c ); }

References m_s.

Referenced by LIB_ID::Parse().

◆ find() [2/2]

std::string::size_type UTF8::find ( char  c,
size_t  s 
) const
inline

Definition at line 106 of file utf8.h.

106{ return m_s.find( c, s ); }

References m_s.

◆ find_first_of()

std::string::size_type UTF8::find_first_of ( const std::string &  str,
std::string::size_type  pos = 0 
) const
inline

Definition at line 117 of file utf8.h.

119 {
120 return m_s.find_first_of( str, pos );
121 }

References m_s.

Referenced by okLogical().

◆ length()

std::string::size_type UTF8::length ( ) const
inline

Definition at line 109 of file utf8.h.

109{ return m_s.length(); }

References m_s.

Referenced by PLACE_FILE_EXPORTER::GenPositionData().

◆ operator const std::string &()

UTF8::operator const std::string & ( ) const
inline

Definition at line 183 of file utf8.h.

183{ return m_s; }

References m_s.

◆ operator wxString()

UTF8::operator wxString ( ) const

Definition at line 52 of file utf8.cpp.

53{
54 return wxString( c_str(), wxConvUTF8 );
55}

◆ operator+=() [1/4]

UTF8 & UTF8::operator+= ( char  ch)
inline

Definition at line 130 of file utf8.h.

131 {
132 m_s.operator+=( ch );
134 return *this;
135 }

References c_str(), m_s, and MAYBE_VERIFY_UTF8.

◆ operator+=() [2/4]

UTF8 & UTF8::operator+= ( const char *  s)
inline

Definition at line 137 of file utf8.h.

138 {
139 m_s.operator+=( s );
141 return *this;
142 }

References c_str(), m_s, and MAYBE_VERIFY_UTF8.

◆ operator+=() [3/4]

UTF8 & UTF8::operator+= ( const UTF8 &  str)
inline

Definition at line 123 of file utf8.h.

124 {
125 m_s += str.m_s;
127 return *this;
128 }

References c_str(), m_s, and MAYBE_VERIFY_UTF8.

◆ operator+=() [4/4]

UTF8 & UTF8::operator+= ( unsigned  w_ch)

Append a wide (unicode) char to the UTF8 string.

If this wide char is not an ASCII7 char, it will be added as a UTF8 multibyte sequence.

Parameters
w_ch  is a UTF-16 value (can be a UTF-32 on Linux)

Definition at line 226 of file utf8.cpp.

227{
228 if( w_ch <= 0x7F )
229 {
230 m_s.operator+=( char( w_ch ) );
231 }
232 else
233 {
234 //TODO: Remove wchar use. Replace with std::byte*
235 wchar_t wide_chr[2]; // buffer to store wide chars (UTF16) read from aText
236 wide_chr[1] = 0;
237 wide_chr[0] = w_ch;
238 UTF8 substr( wide_chr );
239 m_s += substr.m_s;
240 }
241
242 return *this;
243}
An 8 bit string that is assuredly encoded in UTF8, and supplies special conversion support to and fro...
Definition: utf8.h:71
std::string substr(size_t pos=0, size_t len=npos) const
Definition: utf8.h:178

References m_s, and substr().

◆ operator=() [1/4]

UTF8 & UTF8::operator= ( char  c)
inline

Definition at line 169 of file utf8.h.

170 {
171 m_s = c;
173 return *this;
174 }

References c_str(), m_s, and MAYBE_VERIFY_UTF8.

◆ operator=() [2/4]

UTF8 & UTF8::operator= ( const char *  s)
inline

Definition at line 162 of file utf8.h.

163 {
164 m_s = s;
166 return *this;
167 }

References c_str(), m_s, and MAYBE_VERIFY_UTF8.

◆ operator=() [3/4]

UTF8 & UTF8::operator= ( const std::string &  o)
inline

Definition at line 155 of file utf8.h.

156 {
157 m_s = o;
159 return *this;
160 }

References c_str(), m_s, and MAYBE_VERIFY_UTF8.

◆ operator=() [4/4]

UTF8 & UTF8::operator= ( const wxString &  o)

Definition at line 58 of file utf8.cpp.

59{
60 m_s = (const char*) o.utf8_str();
61 return *this;
62}

References m_s.

◆ operator==() [1/3]

bool UTF8::operator== ( const char *  s) const
inline

Definition at line 115 of file utf8.h.

115{ return m_s == s; }

References m_s.

◆ operator==() [2/3]

bool UTF8::operator== ( const std::string &  rhs) const
inline

Definition at line 114 of file utf8.h.

114{ return m_s == rhs; }

References m_s.

◆ operator==() [3/3]

bool UTF8::operator== ( const UTF8 &  rhs) const
inline

Definition at line 113 of file utf8.h.

113{ return m_s == rhs.m_s; }

References m_s.

◆ size()

std::string::size_type UTF8::size ( ) const
inline

Definition at line 110 of file utf8.h.

110{ return m_s.size(); }

References m_s.

Referenced by LIB_ID::Format(), LIB_TABLE::FormatOptions(), guessNickname(), PCB_EDIT_FRAME::LoadFootprints(), and setLibNickname().

◆ substr()

std::string UTF8::substr ( size_t  pos = 0,
size_t  len = npos 
) const
inline

Definition at line 178 of file utf8.h.

179 {
180 return m_s.substr( pos, len );
181 }

References m_s.

Referenced by operator+=(), and LIB_ID::Parse().

◆ ubegin()

uni_iter UTF8::ubegin ( ) const
inline

Returns a uni_iter initialized to the start of "this" UTF8 byte sequence.

Definition at line 279 of file utf8.h.

280 {
281 return uni_iter( m_s.data() );
282 }

References m_s.

Referenced by KIGFX::OPENGL_GAL::computeBitmapTextSize(), and LIB_ID::FixIllegalChars().

◆ uend()

uni_iter UTF8::uend ( ) const
inline

Return a uni_iter initialized to the end of "this" UTF8 byte sequence.

Definition at line 287 of file utf8.h.

288 {
289 return uni_iter( m_s.data() + m_s.size() );
290 }

References m_s.

Referenced by KIGFX::OPENGL_GAL::computeBitmapTextSize(), and LIB_ID::FixIllegalChars().

◆ uni_forward()

int UTF8::uni_forward ( const unsigned char *  aSequence,
unsigned *  aResult = nullptr 
)
static

Advance over a single UTF8 encoded multibyte character, capturing the Unicode character as it goes, and returning the number of bytes consumed.

Parameters
aSequence  is the UTF8 byte sequence, must be aligned on start of character.
aResult  is where to put the unicode character, and may be NULL if no interest.
Returns
the count of bytes consumed.

Definition at line 68 of file utf8.cpp.

69{
70 unsigned ch = *aSequence;
71
72 if( ch < 0x80 )
73 {
74 if( aResult )
75 *aResult = ch;
76 return 1;
77 }
78
79 const unsigned char* s = aSequence;
80
81 static const unsigned char utf8_len[] = {
82 // Map encoded prefix byte to sequence length. Zero means
83 // illegal prefix. See RFC 3629 for details
84 /*
85 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00-0F
86 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
87 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
88 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
89 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
90 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
91 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
92 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70-7F
93 */
94 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80-8F
95 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
96 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
97 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0-BF
98 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0-C1 + C2-CF
99 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0-DF
100 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0-EF
101 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F0-F4 + F5-FF
102 };
103
104 int len = utf8_len[ *s - 0x80 /* top half of table is missing */ ];
105
106 switch( len )
107 {
108 default:
109 case 0:
110 if( aResult )
111 wxFAIL_MSG( "uni_forward: invalid start byte" );
112
113 return 0;
114 break;
115
116 case 2:
117 if( ( s[1] & 0xc0 ) != 0x80 )
118 {
119 if( aResult )
120 wxFAIL_MSG( "uni_forward: invalid continuation byte" );
121
122 return 0;
123 }
124
125 ch = ((s[0] & 0x1f) << 6) +
126 ((s[1] & 0x3f) << 0);
127
128 // assert( ch > 0x007F && ch <= 0x07FF );
129 break;
130
131 case 3:
132 if( (s[1] & 0xc0) != 0x80 ||
133 (s[2] & 0xc0) != 0x80 ||
134 (s[0] == 0xE0 && s[1] < 0xA0)
135 // || (s[0] == 0xED && s[1] > 0x9F)
136 )
137 {
138 if( aResult )
139 wxFAIL_MSG( "uni_forward: invalid continuation byte" );
140
141 return 0;
142 }
143
144 ch = ((s[0] & 0x0f) << 12) +
145 ((s[1] & 0x3f) << 6 ) +
146 ((s[2] & 0x3f) << 0 );
147
148 // assert( ch > 0x07FF && ch <= 0xFFFF );
149 break;
150
151 case 4:
152 if( (s[1] & 0xc0) != 0x80 ||
153 (s[2] & 0xc0) != 0x80 ||
154 (s[3] & 0xc0) != 0x80 ||
155 (s[0] == 0xF0 && s[1] < 0x90) ||
156 (s[0] == 0xF4 && s[1] > 0x8F) )
157 {
158 if( aResult )
159 wxFAIL_MSG( "uni_forward: invalid continuation byte" );
160
161 return 0;
162 }
163
164 ch = ((s[0] & 0x7) << 18) +
165 ((s[1] & 0x3f) << 12) +
166 ((s[2] & 0x3f) << 6 ) +
167 ((s[3] & 0x3f) << 0 );
168
169 // assert( ch > 0xFFFF && ch <= 0x10ffff );
170 break;
171 }
172
173 if( aResult )
174 *aResult = ch;
175
176 return len;
177}

Referenced by IsUTF8(), UTF8::uni_iter::operator*(), UTF8::uni_iter::operator++(), and UTF8::uni_iter::operator->().

◆ wx_str()

Member Data Documentation

◆ m_s

◆ npos

constexpr std::string::size_type UTF8::npos = -1
static constexpr

Definition at line 151 of file utf8.h.

Referenced by LIB_ID::Parse().


The documentation for this class was generated from the following files: