#include <UnicodeClass.h>
Public Types
enum CharacterType {
  kCharacterType_Unknown = 0, kCharacterType_DoubleByte = 0x0001, kCharacterType_Roman = 0x0002, kCharacterType_Upper = 0x0004,
  kCharacterType_Numeric = 0x0008, kCharacterType_Hiragana = 0x0010, kCharacterType_Katakana = 0x0020, kCharacterType_SmallKana = 0x0040,
  kCharacterType_Nobashi = 0x0080, kCharacterType_Kanji = 0x0100, kCharacterType_Symbol = 0x0200, kCharacterType_OpenParenthesis = 0x0400,
  kCharacterType_CloseParenthesis = 0x0800, kCharacterType_Period = 0x1000, kCharacterType_Comma = 0x2000, kCharacterType_MiddlePunc = 0x4000,
  kCharacterType_Other = 0x8000
}

enum IgnoreCharacterDetails {
  kIgnoreZeroWidthOnly = 0, kIgnoreDiscretionaryHyphens = 0x01, kIgnoreCalculatedText = 0x02, kIgnoreTableCharacters = 0x04,
  kIgnoreInlineGraphics = 0x08, kIgnoreNewLine = 0x010, kIgnoreSpecialGlyph = 0x020, kIgnoreUnicodeVariation = 0x040,
  kIgnoreNonLegal = kIgnoreTableCharacters + kIgnoreInlineGraphics + kIgnoreNewLine, kIgnoreSpellingIgnorable = 0x07F
}

Static Public Member Functions
static bool IsLetter (const UTF32TextChar &c)

static bool IsCJKLetter (const UTF32TextChar &c)

static bool IsNonCJKLetter (const UTF32TextChar &c)

static bool IsHiragana (const UTF32TextChar &c)

static bool IsKatakana (const UTF32TextChar &c)

static bool IsCJKIdeograph (const UTF32TextChar &c)

static bool IsBopomofo (const UTF32TextChar &c)

static bool IsJamo (const UTF32TextChar &c)

static bool IsHangul (const UTF32TextChar &c)

static bool IsRomanDigit (const UTF32TextChar &c)

static bool IsJapaneseNumber (const UTF32TextChar &c)

static bool IsAnyNumber (const UTF32TextChar &c)

static bool IsWhiteSpace (const UTF32TextChar &c)

static bool IsCombiningMark (const UTF32TextChar &c)

static bool IsDiacritic_WorldReady (const UTF32TextChar &c)

static bool IsPunctuation (const UTF32TextChar &c)

static bool IsPunctuationDash (const UTF32TextChar &c)

static bool IsPunctuationOpen (const UTF32TextChar &c)

static bool IsPunctuationClose (const UTF32TextChar &c)

static bool IsPunctuationInitialQuote (const UTF32TextChar &c)

static bool IsPunctuationFinalQuote (const UTF32TextChar &c)

static bool IsMidWordPunctuation (const UTF32TextChar &c)

static bool IsSymbol (const UTF32TextChar &c)

static bool IsSymbolMath (const UTF32TextChar &c)

static bool IsSymbolCurrency (const UTF32TextChar &c)

static bool IsGreek (const UTF32TextChar &c)

static bool IsCyrillic (const UTF32TextChar &c)

static bool IsThai (const UTF32TextChar &c)

static int GetLocale (const UTF32TextChar &c)

static UTF32TextChar ToUpper (const UTF32TextChar &c)

static bool IsUppercase (const UTF32TextChar &c)

static bool IsUpper (const UTF32TextChar &c)

static bool CanChangeToUppercase (const UTF32TextChar &c)

static UTF32TextChar ToLower (const UTF32TextChar &c)

static bool IsLowercase (const UTF32TextChar &c)

static bool IsLower (const UTF32TextChar &c)

static bool CanChangeToLowercase (const UTF32TextChar &c)

static UTF32TextChar ToTitle (const UTF32TextChar &c)

static bool IsTitlecase (const UTF32TextChar &c)

static bool CanChangeToTitlecase (const UTF32TextChar &c)

static bool StartsUppercase (const UTF32TextChar &c)

static bool IsCJKFullWidth (const UTF32TextChar &c)

static UTF32TextChar ToFullWidthVariant (const UTF32TextChar &c)

static bool IsNarrowVariant (const UTF32TextChar &c)

static UTF32TextChar ToNarrowVariant (const UTF32TextChar &c)

static UTF32TextChar ToFirstBaseChar (const UTF32TextChar &c)

static UTF32TextChar ToUltimateBaseChar (const UTF32TextChar &c)

static CharacterType GetCharacterType (const UTF32TextChar &c)

static bool IsHighSurrogate (UTF16TextChar c)

static bool IsLowSurrogate (UTF16TextChar c)

static bool IsSurrogate (UTF16TextChar c)

static bool IsVariationSelector (const UTF32TextChar &c)

static bool IsBasicLatin (const UTF32TextChar &c)

static bool IsLatin1 (const UTF32TextChar &c)

static bool IsLatinExtendedA (const UTF32TextChar &c)

static bool IsLatinExtendedB (const UTF32TextChar &c)

static bool IsSuperscriptOrSubscript (const UTF32TextChar &c)

static bool IsIgnoredCharacter (const UTF32TextChar &n, IgnoreCharacterDetails ignoreDischy=kIgnoreDiscretionaryHyphens)

static bool IsHebrewLetter (const UTF32TextChar &c)

static bool IsArabicLetter (const UTF32TextChar &c)

Detailed Description

UnicodeClass is used for classification of Unicode characters used in InDesign. It is primarily a wrapper around the ICU library.

See Also: IGlyphUtils.h for GetUnicodeName.

Member Enumeration Documentation

enum UnicodeClass::CharacterType

Character Type. Bit-field used to classify characters into various classes for processing.

enum UnicodeClass::IgnoreCharacterDetails

IgnoreCharacterDetails. What type of characters should be "ignored" by IsIgnoredCharacter.
Enumerator
kIgnoreZeroWidthOnly
zero-width stuff, break run-in, indent-here
kIgnoreDiscretionaryHyphens
discretionary hyphens
kIgnoreCalculatedText
page number, section name, footnote
kIgnoreTableCharacters
table, table continued
kIgnoreInlineGraphics
inline graphic
kIgnoreNewLine
CR, LF
kIgnoreSpecialGlyph
roman & non-roman
kIgnoreUnicodeVariation
unicode variation sequence characters
kIgnoreNonLegal
kIgnoreTableCharacters + kIgnoreInlineGraphics + kIgnoreNewLine

Member Function Documentation

static CharacterType UnicodeClass::GetCharacterType ( const UTF32TextChar & c )

static

Get character type.

Returns: a classification of the unicode character c.

static int UnicodeClass::GetLocale ( const UTF32TextChar & c )

static

Locale.

Returns: int corresponding to the locale to which the character belongs.

static bool UnicodeClass::IsAnyNumber ( const UTF32TextChar & c )

static

Any numbers.

Returns: true if unicode character c is a number in any script.

static bool UnicodeClass::IsBasicLatin ( const UTF32TextChar & c )

inline static

Basic Latin (low ascii).

Returns: true if unicode character c is low ascii.

static bool UnicodeClass::IsBopomofo ( const UTF32TextChar & c )

static

Chinese bopomofo.

Returns: true if unicode character c is Chinese bopomofo.

static bool UnicodeClass::IsCJKFullWidth ( const UTF32TextChar & c )

static

Full width (CJK).

Returns: true if unicode character c is full width (1 em-box).

static bool UnicodeClass::IsCJKIdeograph ( const UTF32TextChar & c )

static

CJK unified ideographs.

Returns: true if unicode character c is a CJK unified ideograph.

static bool UnicodeClass::IsCJKLetter ( const UTF32TextChar & c )

static

CJK Letters. All CJK letters: ideograph, kana, hangul, half-width kana, etc.

Returns: true if unicode character c is a CJK letter.

static bool UnicodeClass::IsCombiningMark ( const UTF32TextChar & c )

static

Combining marks.

Returns: true if unicode character c is any kind of combining mark (mostly diacritics).

static bool UnicodeClass::IsCyrillic ( const UTF32TextChar & c )

static

Cyrillic.

Returns: true if unicode character c is a Cyrillic character.

static bool UnicodeClass::IsDiacritic_WorldReady ( const UTF32TextChar & c )

static

Diacritics.

Returns: true if the unicode character c is a diacritic according to WRUDGetCharacterProperty.

static bool UnicodeClass::IsGreek ( const UTF32TextChar & c )

static

Greek.

Returns: true if unicode character c is a Greek character.

static bool UnicodeClass::IsHangul ( const UTF32TextChar & c )

static

Korean hangul.

Returns: true if unicode character c is Korean hangul.

static bool UnicodeClass::IsHighSurrogate ( UTF16TextChar c )

inline static

High surrogate.

Returns: true if the UTF16 character c is a high surrogate.

static bool UnicodeClass::IsHiragana ( const UTF32TextChar & c )

static

Japanese Hiragana.

Returns: true if unicode character c is hiragana.

static bool UnicodeClass::IsIgnoredCharacter ( const UTF32TextChar & n, IgnoreCharacterDetails ignoreDischy = kIgnoreDiscretionaryHyphens )

static

The set of "characters" that occupy spots in the model that should be treated neither as whitespace nor as an actual character. These include markers in the text that the user would not consider as something that they had entered into the text. This set does NOT include inline graphic markers or table markers, which always take up a position in the visible text.

static bool UnicodeClass::IsJamo ( const UTF32TextChar & c )

static

Korean jamo.

Returns: true if unicode character c is Korean jamo.

static bool UnicodeClass::IsJapaneseNumber ( const UTF32TextChar & c )

static

Japanese Kanji numbers.

Returns: true if unicode character c is a Japanese Kanji numeral.

static bool UnicodeClass::IsKatakana ( const UTF32TextChar & c )

static

Japanese Katakana.

Returns: true if unicode character c is Katakana.

static bool UnicodeClass::IsLatin1 ( const UTF32TextChar & c )

inline static

Latin 1.

Returns: true if unicode character c is in the Latin 1 unicode group.

static bool UnicodeClass::IsLatinExtendedA ( const UTF32TextChar & c )

inline static

Latin extended A.

Returns: true if unicode character c is in the Latin extended A unicode group.

static bool UnicodeClass::IsLatinExtendedB ( const UTF32TextChar & c )

inline static

Latin extended B.

Returns: true if unicode character c is in the Latin extended B unicode group.

static bool UnicodeClass::IsLetter ( const UTF32TextChar & c )

static

Letters. Not symbols or punctuation, only letters, but ALL letters: ideograph, kana, latin, greek, devanagari, etc.

Returns: true if unicode character c is a letter.

static bool UnicodeClass::IsLowercase ( const UTF32TextChar & c )

static

Is lowercase.

Returns: true if the unicode character c is lowercase.

static bool UnicodeClass::IsLowSurrogate ( UTF16TextChar c )

inline static

Low surrogate.

Returns: true if the UTF16 character c is a low surrogate.

static bool UnicodeClass::IsMidWordPunctuation ( const UTF32TextChar & c )

static

Mid-word punctuation. Used for word counting mostly.

Returns: true if unicode character c is any kind of mid-word punctuation.

static bool UnicodeClass::IsNarrowVariant ( const UTF32TextChar & c )

inline static

Is narrow variant. NOT the same as !IsCJKFullWidth().

Returns: true if the unicode character c is the narrow variant of another character.

static bool UnicodeClass::IsNonCJKLetter ( const UTF32TextChar & c )

static

Letters, but not CJK letters. E.g. latin, greek, hebrew, etc.

Returns: true if unicode character c is a non-CJK letter.

static bool UnicodeClass::IsPunctuation ( const UTF32TextChar & c )

static

Punctuation.

Returns: true if unicode character c is any kind of punctuation.

static bool UnicodeClass::IsPunctuationClose ( const UTF32TextChar & c )

static

Closing punctuation.

Returns: true if unicode character c is any kind of closing punctuation mark.

static bool UnicodeClass::IsPunctuationDash ( const UTF32TextChar & c )

static

Punctuation dash.

Returns: true if unicode character c is any kind of dash.

static bool UnicodeClass::IsPunctuationFinalQuote ( const UTF32TextChar & c )

static

Final quotation mark.

Returns: true if unicode character c is any kind of final quotation punctuation.

static bool UnicodeClass::IsPunctuationInitialQuote ( const UTF32TextChar & c )

static

Initial quotation mark.

Returns: true if unicode character c is any kind of initial quotation punctuation.

static bool UnicodeClass::IsPunctuationOpen ( const UTF32TextChar & c )

static

Open punctuation.

Returns: true if unicode character c is any kind of opening punctuation mark.

static bool UnicodeClass::IsRomanDigit ( const UTF32TextChar & c )

static

Numbers 0-9.

Returns: true if unicode character c is 0-9.

static bool UnicodeClass::IsSuperscriptOrSubscript ( const UTF32TextChar & c )

inline static

Superscript, subscript.

Returns: true if unicode character c is a unicode superscript or subscript.

static bool UnicodeClass::IsSurrogate ( UTF16TextChar c )

inline static

Is surrogate.

Returns: true if the UTF16 character c is a high or low surrogate.

static bool UnicodeClass::IsSymbol ( const UTF32TextChar & c )

static

Symbol.

Returns: true if unicode character c is any kind of symbol character.

static bool UnicodeClass::IsSymbolCurrency ( const UTF32TextChar & c )

static

Currency symbol.

Returns: true if unicode character c is any kind of currency symbol.

static bool UnicodeClass::IsSymbolMath ( const UTF32TextChar & c )

static

Math symbol.

Returns: true if unicode character c is any kind of math symbol.

static bool UnicodeClass::IsThai ( const UTF32TextChar & c )

static

Thai.

Returns: true if unicode character c is a Thai character.

static bool UnicodeClass::IsTitlecase ( const UTF32TextChar & c )

static

Is title case. For unicode characters that have the idea of 2 glyphs (u1CB = Nj, u1F2 = Dz, etc).

Returns: whether the unicode character c is a 2-letter title-case unicode character.

static bool UnicodeClass::IsUppercase ( const UTF32TextChar & c )

static

Is uppercase.

Returns: true if the unicode character c is uppercase.

static bool UnicodeClass::IsWhiteSpace ( const UTF32TextChar & c )

static

White Space.

Returns: true if unicode character c is white space: spaces, tabs, etc.

static bool UnicodeClass::StartsUppercase ( const UTF32TextChar & c )

static

Starts uppercase.

Returns: true if the unicode character c is uppercase or title case.

static UTF32TextChar UnicodeClass::ToFirstBaseChar ( const UTF32TextChar & c )

static

To first base character.

Returns: the unicode character that results from stripping off the lowest priority diacritical mark.

static UTF32TextChar UnicodeClass::ToFullWidthVariant ( const UTF32TextChar & c )

static

To full width (CJK).

Returns: the full-width variant (if one exists) of the unicode character c.

static UTF32TextChar UnicodeClass::ToLower ( const UTF32TextChar & c )

static

Lowercase.

Returns: the lowercase variant of the unicode character c.

static UTF32TextChar UnicodeClass::ToNarrowVariant ( const UTF32TextChar & c )

static

To narrow variant.

Returns: the narrow variant of the unicode character c.

static UTF32TextChar UnicodeClass::ToTitle ( const UTF32TextChar & c )

static

Title case. For unicode characters that have the idea of 2 glyphs (u1CB = Nj, u1F2 = Dz, etc). NOTE: for lowercase letters in general, ToTitle does NOT capitalize them.

Returns: the title-case variant of 2-letter unicode character c.

static UTF32TextChar UnicodeClass::ToUltimateBaseChar ( const UTF32TextChar & c )

static

To ultimate base character.

Returns: the unicode character that results from stripping off all diacritical marks.

static UTF32TextChar UnicodeClass::ToUpper ( const UTF32TextChar & c )

static

Uppercase.

Returns: the uppercase variant of the unicode character c.

Public Types
enum	CharacterType { kCharacterType_Unknown = 0, kCharacterType_DoubleByte = 0x0001, kCharacterType_Roman = 0x0002, kCharacterType_Upper = 0x0004, kCharacterType_Numeric = 0x0008, kCharacterType_Hiragana = 0x0010, kCharacterType_Katakana = 0x0020, kCharacterType_SmallKana = 0x0040, kCharacterType_Nobashi = 0x0080, kCharacterType_Kanji = 0x0100, kCharacterType_Symbol = 0x0200, kCharacterType_OpenParenthesis = 0x0400, kCharacterType_CloseParenthesis = 0x0800, kCharacterType_Period = 0x1000, kCharacterType_Comma = 0x2000, kCharacterType_MiddlePunc = 0x4000, kCharacterType_Other = 0x8000 }

enum	IgnoreCharacterDetails { kIgnoreZeroWidthOnly = 0, kIgnoreDiscretionaryHyphens = 0x01, kIgnoreCalculatedText = 0x02, kIgnoreTableCharacters = 0x04, kIgnoreInlineGraphics = 0x08, kIgnoreNewLine = 0x010, kIgnoreSpecialGlyph = 0x020, kIgnoreUnicodeVariation = 0x040, kIgnoreNonLegal = kIgnoreTableCharacters + kIgnoreInlineGraphics + kIgnoreNewLine, kIgnoreSpellingIgnorable = 0x07F }

Static Public Member Functions
static bool	IsLetter (const UTF32TextChar &c)

static bool	IsCJKLetter (const UTF32TextChar &c)

static bool	IsNonCJKLetter (const UTF32TextChar &c)

static bool	IsHiragana (const UTF32TextChar &c)

static bool	IsKatakana (const UTF32TextChar &c)

static bool	IsCJKIdeograph (const UTF32TextChar &c)

static bool	IsBopomofo (const UTF32TextChar &c)

static bool	IsJamo (const UTF32TextChar &c)

static bool	IsHangul (const UTF32TextChar &c)

static bool	IsRomanDigit (const UTF32TextChar &c)

static bool	IsJapaneseNumber (const UTF32TextChar &c)

static bool	IsAnyNumber (const UTF32TextChar &c)

static bool	IsWhiteSpace (const UTF32TextChar &c)

static bool	IsCombiningMark (const UTF32TextChar &c)

static bool	IsDiacritic_WorldReady (const UTF32TextChar &c)

static bool	IsPunctuation (const UTF32TextChar &c)

static bool	IsPunctuationDash (const UTF32TextChar &c)

static bool	IsPunctuationOpen (const UTF32TextChar &c)

static bool	IsPunctuationClose (const UTF32TextChar &c)

static bool	IsPunctuationInitialQuote (const UTF32TextChar &c)

static bool	IsPunctuationFinalQuote (const UTF32TextChar &c)

static bool	IsMidWordPunctuation (const UTF32TextChar &c)

static bool	IsSymbol (const UTF32TextChar &c)

static bool	IsSymbolMath (const UTF32TextChar &c)

static bool	IsSymbolCurrency (const UTF32TextChar &c)

static bool	IsGreek (const UTF32TextChar &c)

static bool	IsCyrillic (const UTF32TextChar &c)

static bool	IsThai (const UTF32TextChar &c)

static int	GetLocale (const UTF32TextChar &c)

static UTF32TextChar	ToUpper (const UTF32TextChar &c)

static bool	IsUppercase (const UTF32TextChar &c)

static bool	IsUpper (const UTF32TextChar &c)

static bool	CanChangeToUppercase (const UTF32TextChar &c)

static UTF32TextChar	ToLower (const UTF32TextChar &c)

static bool	IsLowercase (const UTF32TextChar &c)

static bool	IsLower (const UTF32TextChar &c)

static bool	CanChangeToLowercase (const UTF32TextChar &c)

static UTF32TextChar	ToTitle (const UTF32TextChar &c)

static bool	IsTitlecase (const UTF32TextChar &c)

static bool	CanChangeToTitlecase (const UTF32TextChar &c)

static bool	StartsUppercase (const UTF32TextChar &c)

static bool	IsCJKFullWidth (const UTF32TextChar &c)

static UTF32TextChar	ToFullWidthVariant (const UTF32TextChar &c)

static bool	IsNarrowVariant (const UTF32TextChar &c)

static UTF32TextChar	ToNarrowVariant (const UTF32TextChar &c)

static UTF32TextChar	ToFirstBaseChar (const UTF32TextChar &c)

static UTF32TextChar	ToUltimateBaseChar (const UTF32TextChar &c)

static CharacterType	GetCharacterType (const UTF32TextChar &c)

static bool	IsHighSurrogate (UTF16TextChar c)

static bool	IsLowSurrogate (UTF16TextChar c)

static bool	IsSurrogate (UTF16TextChar c)

static bool	IsVariationSelector (const UTF32TextChar &c)

static bool	IsBasicLatin (const UTF32TextChar &c)

static bool	IsLatin1 (const UTF32TextChar &c)

static bool	IsLatinExtendedA (const UTF32TextChar &c)

static bool	IsLatinExtendedB (const UTF32TextChar &c)

static bool	IsSuperscriptOrSubscript (const UTF32TextChar &c)

static bool	IsIgnoredCharacter (const UTF32TextChar &n, IgnoreCharacterDetails ignoreDischy=kIgnoreDiscretionaryHyphens)

static bool	IsHebrewLetter (const UTF32TextChar &c)

static bool	IsArabicLetter (const UTF32TextChar &c)

Enumerator
kIgnoreZeroWidthOnly	zero-width stuff, break run-in, indent-here
kIgnoreDiscretionaryHyphens	discretionary hyphens
kIgnoreCalculatedText	page number, section name, footnote
kIgnoreTableCharacters	table, table continued
kIgnoreInlineGraphics	inline graphic
kIgnoreNewLine	CR, LF
kIgnoreSpecialGlyph	roman & non-roman
kIgnoreUnicodeVariation	unicode variation sequence characters
kIgnoreNonLegal	kIgnoreTableCharacters + kIgnoreInlineGraphics + kIgnoreNewLine