package Unicode::UCD;

use strict;
use warnings;

our $VERSION = '0.2';

require Exporter;

our @ISA = qw(Exporter);
our @EXPORT_OK = qw(charinfo charblock charscript charblocks charscripts
                    charinrange compexcl casefold casespec);

use Carp;

=head1 NAME

Unicode::UCD - Unicode character database

=head1 SYNOPSIS

    use Unicode::UCD 'charinfo';
    my $charinfo = charinfo($codepoint);

    use Unicode::UCD 'charblock';
    my $charblock = charblock($codepoint);

    use Unicode::UCD 'charscript';
    my $charscript = charscript($codepoint);

    use Unicode::UCD 'charblocks';
    my $charblocks = charblocks();

    use Unicode::UCD 'charscripts';
    my %charscripts = charscripts();

    use Unicode::UCD qw(charscript charinrange);
    my $range = charscript($script);
    print "looks like $script\n" if charinrange($range, $codepoint);

    use Unicode::UCD 'compexcl';
    my $compexcl = compexcl($codepoint);

    my $unicode_version = Unicode::UCD::UnicodeVersion();

=head1 DESCRIPTION

The Unicode::UCD module offers a simple interface to the Unicode
Character Database.

=cut

# File-scoped handles on the individual Unicode data files.  They start
# out undefined; presumably each is handed by reference to openunicode()
# by the accessor that needs it (callers are not visible from here).
my $UNICODEFH;
my $BLOCKSFH;
my $SCRIPTSFH;
my $VERSIONFH;
my $COMPEXCLFH;
my $CASEFOLDFH;
my $CASESPECFH;

# openunicode(\$fh, @path)
#
# Opens "unicore/@path" for reading from the first directory in @INC in
# which the open succeeds, and stores the handle in the scalar that $rfh
# refers to.
#
# Returns the full path of the file that was opened.  If $$rfh is already
# defined on entry, nothing is searched or opened and undef is returned,
# so callers should test the handle rather than the return value.
#
# Croaks when the file cannot be opened under any @INC directory.  In
# that case $$rfh is reset to undef, so that a later call (for example
# after @INC has been adjusted) performs the search again.
sub openunicode {
    my ($rfh, @path) = @_;

    # "use" is processed at compile time, so its position is irrelevant
    # at run time; it stays inside the sub to keep the sub self-contained.
    use File::Spec;

    my $f;
    unless (defined $$rfh) {
        for my $d (@INC) {
            # @INC may hold hooks (code refs, array refs, objects);
            # those are not directories and cannot contain the file.
            next if ref $d;

            $f = File::Spec->catfile($d, "unicore", @path);

            # Three-argument open: the path is never parsed for a mode
            # or pipe character, whatever the @INC entry looks like.
            last if open($$rfh, '<', $f);
            undef $f;
        }
        unless (defined $f) {
            # A failed open() still autovivifies $$rfh into an unopened
            # glob; clear it so the "already open" check above does not
            # mistake it for a usable handle next time.
            undef $$rfh;
            croak __PACKAGE__, ": failed to find ",
                  File::Spec->catfile(@path), " in @INC";
        }
    }
    return $f;
}

=head2 charinfo

    use Unicode::UCD 'charinfo';

    my $charinfo = charinfo(0x41);

charinfo() returns a reference to a hash that has the following fields
as defined by the Unicode standard:

    key

    code             code point with at least four hexdigits
    name             name of the character IN UPPER CASE
    category         general category of the character
    combining        classes used in the Canonical Ordering Algorithm
    bidi             bidirectional category
    decomposition    character decomposition mapping
    decimal          if decimal digit this is the integer numeric value
    digit            if digit this is the numeric value
    numeric          if numeric is the integer or rational numeric value
    mirrored         if mirrored in bidirectional text
    unicode10        Unicode 1.0 name if existed and different
    comment          ISO 10646 comment field
    upper            uppercase equivalent mapping
    lower            lowercase equivalent mapping
    title            titlecase equivalent mapping

    block            block the character belongs to (used in \p{In...})
    script           script the character belongs to

If no match is found, a reference to an empty hash is returned.

The C<block> property is the same as returned by charblock().  It is
not defined in the Unicode Character Database proper (Chapter 4 of the
Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
(Chapter 14 of TUS3).  Similarly for the C