Browse Source

Add support for looking up the general category group for a codepoint.

master
Reece H. Dunn 12 years ago
parent
commit
ff7a5e0209
3 changed files with 69 additions and 18 deletions
  1. 21
    0
      src/categories.cpp
  2. 25
    18
      src/include/ucd/ucd.h
  3. 23
    0
      tools/categories.py

+ 21
- 0
src/categories.cpp View File

@@ -3280,3 +3280,24 @@ ucd::category ucd::lookup_category(codepoint_t c)
if (c <= 0x10FFFF) return Cn; // 10FFFE..10FFFF : Plane 16 Private Use
return Ci;
}

/** @brief Lookup the General Category Group for a Unicode codepoint.
  *
  * The group is derived from the value returned by lookup_category: each
  * General Category is mapped onto the major class it belongs to.
  *
  * @param c The Unicode codepoint to lookup.
  * @return The General Category Group of the Unicode codepoint.
  */
ucd::category_group ucd::lookup_category_group(codepoint_t c)
{
	// NOTE: There is deliberately no default label, so that compilers can
	// warn (-Wswitch) if a category is added without being mapped here.
	switch (lookup_category(c))
	{
	// Ci (invalid codepoint) and Zc (whitespace in the Cc category) are
	// reported as part of the Other group.
	case Cc: case Cf: case Ci: case Cn: case Co: case Cs: case Zc:
		return C;
	case Ll: case Lm: case Lo: case Lt: case Lu:
		return L;
	case Mc: case Me: case Mn:
		return M;
	case Nd: case Nl: case No:
		return N;
	case Pc: case Pd: case Pe: case Pf: case Pi: case Po: case Ps:
		return P;
	case Sc: case Sk: case Sm: case So:
		return S;
	case Zl: case Zp: case Zs:
		return Z;
	}

	// Flowing off the end of a non-void function is undefined behaviour, so
	// any value not handled above is reported as Other.
	return C;
}

+ 25
- 18
src/include/ucd/ucd.h View File

@@ -31,13 +31,25 @@ namespace ucd
*/
typedef uint32_t codepoint_t;

/** @brief Unicode General Category Groups
* @see http://www.unicode.org/reports/tr44/
*/
enum category_group
{
C, /**< @brief Other */
L, /**< @brief Letter */
M, /**< @brief Mark */
N, /**< @brief Number */
P, /**< @brief Punctuation */
S, /**< @brief Symbol */
Z, /**< @brief Separator */
};

/** @brief Unicode General Category Values
* @see http://www.unicode.org/reports/tr44/
*/
enum category
{
// Other

Cc, /**< @brief Control Character */
Cf, /**< @brief Format Control Character */
Ci, /**< @brief Invalid Unicode Character */
@@ -45,35 +57,20 @@ namespace ucd
Co, /**< @brief Private Use */
Cs, /**< @brief Surrogate Code Point */

// Letter

Ll, /**< @brief Lower Case Letter */
Lm, /**< @brief Letter Modifier */
Lo, /**< @brief Other Letter */
Lt, /**< @brief Title Case Letter */
Lu, /**< @brief Upper Case Letter */

// Mark

Mc, /**< @brief Spacing Mark */
Me, /**< @brief Enclosing Mark */
Mn, /**< @brief Non-Spacing Mark */

// Symbol

Sc, /**< @brief Currency Symbol */
Sk, /**< @brief Modifier Symbol */
Sm, /**< @brief Math Symbol */
So, /**< @brief Other Symbol */

// Number

Nd, /**< @brief Decimal Digit */
Nl, /**< @brief Letter-Like Number */
No, /**< @brief Other Number */

// Punctuation

Pc, /**< @brief Connector */
Pd, /**< @brief Dash/Hyphen */
Pe, /**< @brief Close Punctuation Mark */
@@ -82,7 +79,10 @@ namespace ucd
Po, /**< @brief Other */
Ps, /**< @brief Open Punctuation Mark */

// Separator
Sc, /**< @brief Currency Symbol */
Sk, /**< @brief Modifier Symbol */
Sm, /**< @brief Math Symbol */
So, /**< @brief Other Symbol */

Zc, /**< @brief Whitespace character in the Cc category */
Zl, /**< @brief Line Separator */
@@ -90,6 +90,13 @@ namespace ucd
Zs, /**< @brief Space Separator */
};

/** @brief Lookup the General Category Group for a Unicode codepoint.
*
* The group is the major class (C, L, M, N, P, S or Z) of the General
* Category that lookup_category reports for the codepoint. Invalid
* codepoints (Ci) and whitespace control characters (Zc) are reported
* as C.
*
* @param c The Unicode codepoint to lookup.
* @return The General Category Group of the Unicode codepoint.
*/
category_group lookup_category_group(codepoint_t c);

/** @brief Lookup the General Category for a Unicode codepoint.
*
* @param c The Unicode codepoint to lookup.

+ 23
- 0
tools/categories.py View File

@@ -153,3 +153,26 @@ using namespace ucd;
sys.stdout.write('\t}\n')
sys.stdout.write('\treturn Ci;\n')
sys.stdout.write('}\n')

# Emit ucd::lookup_category_group, which maps the General Category returned by
# lookup_category onto its General Category Group. The generated switch has no
# default label so the C++ compiler can warn about unmapped categories; the
# trailing return stops the generated function from flowing off its end
# (undefined behaviour in C++) for any value the switch does not handle.
sys.stdout.write("""
ucd::category_group ucd::lookup_category_group(codepoint_t c)
{
	switch (lookup_category(c))
	{
	case Cc: case Cf: case Ci: case Cn: case Co: case Cs: case Zc:
		return C;
	case Ll: case Lm: case Lo: case Lt: case Lu:
		return L;
	case Mc: case Me: case Mn:
		return M;
	case Nd: case Nl: case No:
		return N;
	case Pc: case Pd: case Pe: case Pf: case Pi: case Po: case Ps:
		return P;
	case Sc: case Sk: case Sm: case So:
		return S;
	case Zl: case Zp: case Zs:
		return Z;
	}
	return C;
}
""")

Loading…
Cancel
Save