Edit file File name : umap Content :#!/usr/bin/perl -w =head1 NAME umap - map between different character sets =head1 SYNOPSIS umap [options] <before>:<after> =head1 DESCRIPTION The I<umap> script acts as a filter between different encodings and character sets. The following options are recognized: =over 4 =item --list [charset] Without argument list all character sets recognized. With a specified character set list the mapping between this set and Unicode. =item --strict Do the stict mapping between the character sets. The default is to not translate unmapped character. With I<--stict> we will remove unmapped characters or use the default specified with I<--def8> or I<--def16>. =item --def8=<charcode> Set the default 8-bit code for unmapped chars. =item --def16=<charcode> Set the default 16-bit code for unmapped chars. =item --verbose Generate more verbose output. =item --version Print the version number of this program and quit. =item --help Print the usage message. =back =head1 SEE ALSO L<Unicode::String>, L<Unicode::Map8>, recode(1) =head1 COPYRIGHT Copyright 1998 Gisle Aas. This is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut use strict; use Getopt::Long qw(GetOptions); my $VERSION = "1.05"; my $list; my $strict; my $verbose; my $def8; my $def16; my $before; my $after; GetOptions('version' => \&print_version, 'help' => \&usage, 'list:s' => \$list, 'verbose' => \$verbose, 'strict!' => \$strict, 'def8=i' => \$def8, 'def16=i' => \$def16, ) || usage (); if (defined $list) { if (length($list)) { list_charset($list); } else { list_charsets(); } exit; } # Try to extract $before/$after from the remaining arguments $before = shift || $ENV{UMAP_BEFORE} || "latin1"; if (!@ARGV && $before =~ s/([^\\]):/$1\0/) { ($before, $after) = split('\0', $before, 2); } unless ($after) { $after = shift || $ENV{UMAP_AFTER} || "utf8"; } for ($before, $after) { s/\\:/:/g; } usage() if @ARGV; print STDERR "$before --> $after\n" if $verbose; #------------------------------------------------------------------ package MySpace; # use a new namespace use Unicode::String 2.00 qw(ucs4 ucs2 utf16 utf7 utf8); my $bsub = \&{$before}; unless (defined(&$bsub)) { require Unicode::Map8; my $map = Unicode::Map8->new($before); die "Don't know about charset '$before'\n" unless $map; $map->nostrict unless $strict; $map->default_to16($def16) if defined($def16); no strict 'refs'; *{$before} = sub { $map->tou($_[0]); }; } if ($after =~ /^(ucs[24]|utf16|utf[78])$/) { *out = sub { print $_[0]->$after(); }; } elsif ($after eq "hex") { *out = sub { my $hex = $_[0]->hex; $hex =~ s/U\+000a\s*/U+000a\n/g; print $hex; }; } elsif ($after eq "uname") { require Unicode::CharName; *out = sub { for ($_[0]->unpack) { printf "U+%04X %s\n", $_, Unicode::CharName::uname($_) || ""; } }; } else { require Unicode::Map8; my $map = Unicode::Map8->new($after); die "Don't know about charset '$after'\n" unless $map; $map->nostrict unless $strict; $map->default_to8($def8) if defined($def8); #*out = sub { print $map->to8(${$_[0]}); }; *out = sub { print $map->to8(${$_[0]}); }; } if (-t STDIN || $before =~ /^utf[78]$/) { # must read a line at the time (should not break encoded chars) my $line; while (defined($line = <STDIN>)) { out(&$bsub($line)); } } else { my $n; my $buf; # must read buffers which are multiples of 4 bytes (ucs4) while ( $n = read(STDIN, $buf, 512)) { #print "$n bytes read\n"; out(&$bsub($buf)); } } #------------------------------------------------------------------ package main; sub list_charset { require Unicode::Map8; require Unicode::CharName; my($charset, $format) = @_; my $m = Unicode::Map8->new($charset); die "Don't know about charset $charset\n" unless $m; my @res8; my %map16; for (my $i = 0; $i < 256; $i++) { my $u = $m->to_char16($i); if ($u == Unicode::Map8::NOCHAR()) { push(@res8, sprintf "# 0x%02X unmapped\n", $i) if $verbose; } else { push(@res8, sprintf "0x%02X 0x%04X # %s\n", $i, $u, Unicode::CharName::uname($u)); $map16{$u} = $i; } } my @res16; my @blocks; for (my $block = 0; $block < 256; $block++) { next if $m->_empty_block($block); push(@blocks, $block); for (my $i = 0; $i < 256; $i++) { my $u = $block*256 + $i; my $c = $m->to_char8($u); next if $c == Unicode::Map8::NOCHAR(); next if exists $map16{$u} && $map16{$u} == $c; push(@res16, sprintf "0x%02X 0x%04X # %s\n", $c, $u, Unicode::CharName::uname($u)); } } print "# Mapping for '$charset'\n"; print "#\n"; printf "# %d allocated blocks", scalar(@blocks); if (@blocks > 1 || $blocks[0] != 0) { print " (", join(", ", map "#".($_+1), @blocks), ")"; } print "\n"; print "#\n"; print @res8; if (@res16) { print "\n# Extra 16-bit to 8-bit mappings\n"; print @res16; } } sub list_charsets { require Unicode::Map8; my %set = ( ucs4 => {}, ucs2 => {utf16 => 1}, utf7 => {}, utf8 => {}, ); if (opendir(DIR, $Unicode::Map8::MAPS_DIR)) { my $f; while (defined($f = readdir(DIR))) { next unless -f "$Unicode::Map8::MAPS_DIR/$f"; $f =~ s/\.(?:bin|txt)$//; $set{$f} = {} if Unicode::Map8->new($f); } } my $avoid_warning = keys %Unicode::Map8::ALIASES; while ( my($alias, $charset) = each %Unicode::Map8::ALIASES) { if (exists $set{$charset}) { $set{$charset}{$alias} = 1; } else { warn "$charset does not seem to exist (aliased as $alias)\n"; } } for (sort keys %set) { print "$_"; if (%{$set{$_}}) { print " ", join(" ", sort keys %{$set{$_}}); } print "\n"; } } sub print_version { require Unicode::Map8; my $avoid_warning = $Unicode::Map8::VERSION; print <<"EOT"; This is umap version $VERSION (Unicode-Map8-$Unicode::Map8::VERSION) Copyright 1998, Gisle Aas. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. EOT exit 0; } sub usage { (my $progname = $0) =~ s,.*/,,; die "Usage:\t$progname [options] <before>:<after> The options are: --list [charset] list character sets --strict use the strict mapping --def8 <code> default 8-bit code for unmapped chars --def16 <code> default 16-bit code for unmapped chars --version print version number and quit "; } Save