#!/usr/bin/env perl
#
# This script creates a towctrans.h header (or C source) file, musl old-style,
# for simple case mapping as implemented in towlower() and towupper().
# Used for building musl and safeclib for fast and small upper/lowercasing
# tables for towlower() and towupper() and its secure variants
# towupper_s() and towlower_s(). Planned also for the multi-byte
# folding tables for towfc().
#
# The generated code is licensed under the MIT.
#
# Usage:
#    gen_wctrans [-v 17] [--out towctrans.h] [--ud UnicodeData.txt] [--help]
#
# Input files (will be downloaded if missing):
#    UnicodeData.txt  (primary: Simple_Uppercase/Lowercase_Mapping fields)
#
# Output files:
#    towctrans.h or towctrans.c
#    towfc.h (later)

use 5.012;
use strict;
use warnings;
use Carp;

# Minimum size of and excluded range. The more ranges we have, the slower.
# The larger the ranges, the more misses in holes we might have, going through
# all checks. musl-old had ~2500. Tested via bench
my $MIN_ECXL = 2500;

BEGIN {
    unless ( 'A' eq pack( 'U', 0x41 ) ) {
        die "Unicode::Towctrans cannot stringify a Unicode code point\n";
    }
    unless ( 0x41 == unpack( 'U', 'A' ) ) {
        die "Unicode::Towctrans cannot get Unicode code point\n";
    }
}
our $PACKAGE = 'Unicode::Towctrans';
$Unicode::Towctrans::VERSION = '0.04';

use Getopt::Long;
my (
    $v,     $lower16, $bits,    $help, $verbose,
    $safec, $musl,    $bsearch, $bsearch_both,
    $table
);
my $ud            = "UnicodeData.txt";
my $out           = "towctrans.h";
my $fn            = "_towcase";
my $with_iswalpha = 0;
my $cmdline_args  = join( " ", @ARGV );
$cmdline_args = " $cmdline_args" if $cmdline_args;

GetOptions(
    "v=i"           => \$v,                # numeric
    "ud=s"          => \$ud,               # string (UnicodeData.txt)
    "out|o=s"       => \$out,              # string
    "with-iswalpha" => \$with_iswalpha,    # flag
    "min-excl=i"    => \$MIN_ECXL,         # numeric
    "lower16"       => \$lower16,          # flag
    "bits=s"        => \$bits,             # string
    "bsearch"       => \$bsearch,          # flag
    "bsearch-both"  => \$bsearch_both,     # flag
    "table"         => \$table,            # flag
    "fn=s"          => \$fn,               # string
    "musl"          => \$musl,             # flag
    "safec"         => \$safec,            # flag
    "verbose"       => \$verbose,          # flag
    "help|h"        => \$help              # flag
) or die("Error in command line arguments\n");
if ($help) {
    print <<'EOF';
gen_wctrans [OPTIONS]
Generate wide-char case mapping C header file
OPTIONS
-v NUM                for Unicode major version number
--ud UnicodeData.txt  input filename. default: UnicodeData.txt
                      Downloaded if not found.
--out filename        default: towctrans.h
--with-iswalpha       if you can trust iswalpha. not with glibc, only musl.
--min-excl NUM        exclude ranges lower than NUM (default: 2500)
--lower16             use 16bit shorts for casemap.lower instead of 8.
                      moves more ranges from the long to the short first check.
--bits 16:10:8        set other bitsizes for the casemaps struct members
--bsearch             binary search the towlower tables
--bsearch-both        binary search the towupper tables also (bigger)
--table               use the musl-new style two-level base-6 tables (bigger)
--fn name             function name (default: _towcase)
--musl                create towctrans.c for musl. with iswalpha and LOCALE_TR
--safec               create towctrans.c for safeclib, with LOCALE_TR
--verbose
--help
EOF
    exit;
}

if ($table) {
    die "cannot --bsearch with --table" if $bsearch;
    die "cannot --bsearch-both with --table" if $bsearch_both;
    die "cannot --lower16 with --table" if $lower16;
    die "cannot --bits with --table" if $bits;
}
if ($musl) {
    $with_iswalpha = 1;
    $out           = "towctrans.c" if $out eq "towctrans.h";
    $fn            = "__towcase";
}
if ($safec) {
    die "cannot use --sace and --musl together\n" if $musl;
    $out = "towctrans.c"                          if $out eq "towctrans.h";
}
if ( !$v ) {
    use Unicode::UCD;
    my $full = Unicode::UCD::UnicodeVersion();
    ($v) = $full =~ /^(\d+)\./;
}
my ( $f_upper, $f_lower, $f_len, $use_bitfields, $f_sign, @bits );
if ($bits) {
    @bits = split /:/, $bits;
    die "bits should be 3 colon-separated numbers for upper, lower and len"
      if @bits != 3;
    for (@bits) {
        die "Illegal bits size" if $_ < 1 or $_ > 32;
        if ( $_ != 8 and $_ != 16 and $_ != 32 ) {
            $use_bitfields = 1;
        }
    }
}
else {
    @bits = $lower16 ? ( 16, 16, 8 ) : ( 16, 8, 8 );
}
my @cmt = (
    "/* base */",
    "/* distance from upper to lower. 1 with LACE */",
    "/* how many */"
);
$cmt[$_] = ( $bits[$_] >= 10 ? "" : " " ) . $cmt[$_] for 0 .. 2;
if ($use_bitfields) {
    $f_upper = sprintf( "unsigned upper : %u; %s", $bits[0], $cmt[0] );
    $f_sign  = sprintf("unsigned sign  : 1;  /* if negative */");
    $f_lower = sprintf( "unsigned lower : %u; %s", $bits[1], $cmt[1] );
    $f_len   = sprintf( "unsigned len : %u;   %s", $bits[2], $cmt[2] );
}
else {
    $f_upper = sprintf( "uint%u_t upper; %s", $bits[0], $cmt[0] );
    $f_lower = sprintf( "int%u_t lower;  %s", $bits[1], $cmt[1] );
    $f_len   = sprintf( "uint%u_t len;   %s", $bits[2], $cmt[2] );
}

binmode *STDOUT, ':utf8';
binmode *STDERR, ':utf8';
my (
    @map,   @excl, @pair,  $prev, $lc,
    %upper, %lower, @CASE, @CASEL, @PAIR, @PAIRL,
    @CASE_RAW, @CASEL_RAW
);

########## helpers ##########

## converts string "hhhh hhhh hhhh" to a numeric list
## (hex digits separated by spaces)
sub getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g }

# find the last 2-member (lace) entry in @map, or undef
sub last_lace {
    my $map = shift;
    for my $i ( reverse 0 .. $#$map ) {
        return $map->[$i] if @{ $map->[$i] } == 2;
    }
    return undef;
}

# find the last 3-member (map) entry in @map, or undef
sub last_map3 {
    my $map = shift;
    for my $i ( reverse 0 .. $#$map ) {
        return $map->[$i] if @{ $map->[$i] } == 3;
    }
    return undef;
}

## simulate the search via the global @map, @excl, @pair arrays
## to be able to leave out an uneeded pair,lace,map. Esp. for upper.
sub simulate {
    my ( $wc, $lower ) = @_;
    die if $lower != 0 && $lower != 1;

    for my $e (@excl) {
        if ( $wc >= $e->[0] && $wc <= $e->[1] ) {
            warn(
                sprintf(
                    "simulate wc %04X in excl [%04X, %04X]\n",
                    $wc, $e->[0], $e->[1]
                )
            ) if $verbose;
            return $wc;
        }
        last if $wc > $e->[1];
    }

    # skip the turkish checks
    my $lmul  = 2 * $lower - 1;    # 1 for lower, -1 for upper
    my $lmask = $lower - 1;        # 0 for lower, -1/0xffff for upper
    # single loop over @map: 3-member = map, 2-member = lace
    for my $m (@map) {
        if ( @$m == 3 ) {
            # MAP entry
            my $lower = map_lower($m);
            my $len   = map_len($m);
            my $base  = $m->[0] + ( $lmask & $lower );
            if ( $wc - $base < $len ) {

                # The only reverse fixup needed. Tested from Unicode 4 to 18.
                # clashes with 1E9B; C; 1E61 */
                if ( !$lower && $wc == 0x1E61 ) {
                    return 0x1E60;
                }
                else {
                    return $wc + $lmul * $lower;
                }
            }
        }
        elsif ( @$m == 2 ) {
            # LACE entry
            my $lower = 1;    # lace_lower is always 1
            my $len   = $m->[1] - $m->[0];    # lace_len_1
            my $base  = $m->[0] + ( $lmask & $lower );
            if ( $wc - $base < $len ) {

                # The only reverse fixup needed. Tested from Unicode 4 to 18.
                # clashes with 1E9B; C; 1E61 */
                if ( !$lower && $wc == 0x1E61 ) {
                    return 0x1E60;
                }
                else {
                    return $wc + $lmul * $lower;
                }
            }
        }
    }

    # pairs
    for my $p (@pair) {
        if ( $p->[ 1 - $lower ] == $wc ) {
            return $p->[$lower];
        }
        if ( $lower && $p->[0] > $wc ) {
            last;
        }
    }
    return $wc;
}

########## writing header files ##########

#         from    until to (=lower of from)
# CASEMAP(0x00c0, 0xd6, 0xe0), // 192 32 23
# CASEMAP(0x00d8, 0xde, 0xf8), // 216 32 7
# i.e. 100-101,102-103...12e-12f
# CASELACE(0x0100, 0x12e), // 256 1 47
# CASELACE(0x0132, 0x136), // 306 1 5
# CASELACE(0x0139, 0x147), // 313 1 15
# CASELACE(0x014a, 0x176), // 330 1 45
# CASELACE(0x0179, 0x17d), // 377 1 5
# CASELACE(0x01a0, 0x1a4), // 416 1 5 O WITH HORN - P WITH HOOK
# CASELACE(0x01b3, 0x1b5), // 435 1 3
# CASELACE(0x01cd, 0x1db), // 461 1 15

sub map_lower {
    my $m = shift;
    return $m->[2] - $m->[0];
}

# note that our lace->[1] is one too much
sub map_len {
    my $m = shift;
    return $m->[1] - $m->[0] + 1;
}

sub map_range {
    my $m = shift;
    return ( $m->[0], $m->[2] );
}

sub map_clashes {

    # eg   CASEMAP(0x01f1, 0x01f1, 0x01f3) (ie pair 1f1, 1f3)
    # with CASELACE(0x01f2, 0x01f4) (ie pair 1f2, 1f3) => true
    # needed for upper 01F3 => 01F2 by lace, 01F3 => 01F2 by map.
    # Handles both 2-member (lace) and 3-member (map) entries.
    my ( $m, $l ) = @_;
    return 0 if !defined($m) or !defined($l);
    # normalize lace (2-member) to 3-member for comparison
    my $l3 = @$l == 2 ? [ $l->[0], $l->[1], $l->[0] + 1 ] : $l;
    my $l_len = map_len($l3);
    my $l_max = $l3->[1] + $l_len;
    my $m3 = @$m == 2 ? [ $m->[0], $m->[1], $m->[0] + 1 ] : $m;
    return 1 if $m3->[0] >= $l3->[0] && $m3->[2] <= $l_max;
}

# triple of base, last, to (3-member), or pair of base, last (2-member lace).
# offset is to - base, length is last - base + 1
# $base is upper, $to is lower
# We might want set a flag when lower is < 0, because then the
# casemap range check cannot be used. But from unicode 4..18 it always is.
# eg. 1f7 starts at 1b7 already.
sub add_map {
    my ( $map, $base, $to ) = @_;

    my $last3 = last_map3($map);    # last 3-member (map) entry
    my $diff  = $to - $base;
    my $olen  = $last3 ? $last3->[1] - $last3->[0] : -1;
    my $odiff = $last3 ? $last3->[2] - $last3->[0] : 0;

    # if it's the next cp and has the same offset
    if ( $last3 and $last3->[1] == $base - 1 && $diff == $odiff ) {
        ++$last3->[1];
        if ($verbose) {
            warn sprintf( "bump map [%04X, %04X, %04X]\n", @$last3 );
        }
    }

    # check if the previous map has only len 1,
    # convert to pair then.
    elsif ( $last3
        && map_len($last3) == 1 )
    {
        my $ll = last_lace($map);
        if ( map_clashes( $last3, $ll ) ) {
            warn(
                sprintf(
                    "last map %04X, %04X clashes with last lace %04X, %04X\n",
                    $last3->[0], $last3->[2], @$ll
                )
            ) if $verbose;
        }
        add_pair( \@pair, $last3->[0], $last3->[2] );
        warn(
            sprintf( "convert short map to pair %04X, %04X\n", @{ $pair[-1] } )
        ) if $verbose;
        $last3->[0] = $base;
        $last3->[1] = $base;
        $last3->[2] = $to;
        if ($verbose) {
            warn sprintf( "new map [%04X, %04X, %04X]\n", @$last3 );
        }
    }
    else {
        my $ll = last_lace($map);
        if ( $diff == 1 and map_clashes( [ $base, $base, $to ], $ll ) ) {

            # do we need this pair?
            if ( simulate( $base, 0 ) != $to && simulate( $to, 1 ) != $base ) {
                warn(
                    sprintf(
"add pair: new map %04X, %04X clashes with last lace %04X, %04X\n",
                        $base, $to, @$ll
                    )
                ) if $verbose;
                add_pair( \@pair, $base, $to );
            }
            else {
                warn( sprintf( "pair %04X, %04X not needed\n", $base, $to ) )
                  if $verbose;
            }
        }
        else {
            push @$map, [ $base, $base, $to ];
            if ($verbose) {
                warn sprintf( "new map [%04X, %04X, %04X]\n", @{ $map->[-1] } );
            }
        }
    }
}

# pair of base, last with lower offset of 1 (2-member entry in @map)
sub add_lace {
    my ( $map, $base, $to ) = @_;

    my $ll = last_lace($map);    # last 2-member (lace) entry in @map

    # if it's the next cp
    if (   $ll
        && $ll->[1] == $to - 2 )
    {
        $ll->[1] = $to;
        warn( sprintf( "bump lace %04X, %04X\n", @$ll ) )
          if $verbose;
    }

    # check if the previous lace has only len 1, convert to pair then
    # lace_len_1($ll) == $ll->[1] - $ll->[0]
    elsif ( $ll
        && ( $ll->[1] - $ll->[0] ) == 1 )
    {
        my $lm = last_map3($map);
        if ( map_clashes( $lm, $ll ) ) {
            warn(
                sprintf(
                    "last lace %04X, %04X clashes with last map %04X, %04X\n",
                    @$ll,
                    $lm->[0], $lm->[2]
                )
            ) if $verbose;
        }
        warn(
            sprintf( "convert short lace to pair %04X, %04X\n",
                @$ll )
        ) if $verbose;
        add_pair( \@pair, $ll->[0], $ll->[1] );
        $ll->[0] = $base;
        $ll->[1] = $to;
        warn( sprintf( "new lace %04X, %04X\n", @$ll ) )
          if $verbose;
    }
    else {
        my $lm = last_map3($map);
        if ( map_clashes( $lm, [ $base, $to ] ) ) {
            # Clash with last map: demote to pair instead of lace
            warn(
                sprintf(
"new lace %04X, %04X clashes with last map %04X, %04X => pair\n",
                    $base, $to, $lm->[0], $lm->[2]
                )
            ) if $verbose;
            add_pair( \@pair, $base, $to );
        }
        else {
            push @$map, [ $base, $to ];
            if ($verbose) {
                warn sprintf( "new lace %04X, %04X\n", @{ $map->[-1] } );
            }
        }
    }
}

# also check the reverse if deviating. upper(03BC) => 039C, not B5
sub add_pair {
    my ( $pair, $base, $to ) = @_;

    # do we need this pair?
    if ( simulate( $base, 0 ) != $to && simulate( $to, 1 ) != $base ) {
        push @$pair, [ $base, $to ];
    }
    else {
        # same result without this pair already
        warn( sprintf( "pair %04X, %04X not needed\n", $base, $to ) )
          if $verbose;
    }
}

# exclude a pair of first, last.
# also observe the existing lhs cp and rhs lower mappings
sub add_excl {
    my ( $excl, $base ) = @_;

    if ( exists $lower{$base} or exists $upper{$base} ) {
        warn( sprintf( "skip excl %04X\n", $base ) ) if $verbose;
        return;
    }

    # if it's the next cp
    if ( $excl->[-1] && $excl->[-1][1] == $base - 1 ) {
        ++$excl->[-1][1];    # extend range
        warn( sprintf( "bump excl [%04X, %04X]\n", @{ $excl->[-1] } ) )
          if $verbose;
    }
    else {
        push @$excl, [ $base, $base ];    # new range
        warn( sprintf( "new excl [%04X, %04X]\n", @{ $excl->[-1] } ) )
          if $verbose;
    }
}

if ( $v and !-s $ud ) {
    my $url = "https://www.unicode.org/Public/$v.0.0/ucd/UnicodeData.txt";
    `wget $url -O $ud`
        or `wget --no-check-certificate $url -O $ud`;
    unless (-s $ud) {
        `curl --silent $url -O $ud`;
    }
    unless (-s $ud) {
        require LWP::Simple;
        my $content = LWP::Simple::get($url);
        open my $fh, ">", $ud or die "$PACKAGE: failed to download $url";
        print $fh $content;
        close $fh;
    }
    unless (-s $ud) {
        die "$PACKAGE: failed to download $url: $!" unless -s $ud;
    }
}
open my $UD, "<", $ud or croak "$PACKAGE: $ud can't be read $!";

# Read UnicodeData.txt to build:
# 1. %upper (uppercase->lowercase, from field 13: Simple_Lowercase_Mapping)
# 2. %lower (lowercase->uppercase, from field 12: Simple_Uppercase_Mapping)
#
# UnicodeData.txt fields (0-indexed, semicolon-separated):
#   field 0:  codepoint hex
#   field 2:  General_Category (Lu, Ll, Lt, etc.)
#   field 12: Simple_Uppercase_Mapping
#   field 13: Simple_Lowercase_Mapping

while ( my $l = <$UD> ) {
    chomp $l;
    next if $l =~ /^\s*#/;
    next if $l =~ /^\s*$/;
    my @fields = split /;/, $l;
    my $cp = hex($fields[0]);

    # Auto-detect version from the first codepoint range comment if needed
    # (UnicodeData.txt has no version header, rely on -v or UCD fallback)

    my $uc_map = $fields[12];
    my $lc_map = $fields[13];

    # %upper maps uppercase->lowercase (codepoints with a lowercase mapping)
    if ( defined $lc_map && $lc_map =~ /^[0-9A-F]+$/i ) {
        my $lc_val = hex($lc_map);
        $upper{$cp} = $lc_val;
    }
    # %lower maps lowercase->uppercase (codepoints with an uppercase mapping)
    if ( defined $uc_map && $uc_map =~ /^[0-9A-F]+$/i ) {
        my $uc_val = hex($uc_map);
        $lower{$cp} = $uc_val;
    }
}
close $UD;

warn sprintf("UnicodeData.txt: %d upper (uc->lc) and %d lower (lc->uc) mappings\n",
    scalar keys %upper, scalar keys %lower)
  if $verbose;

# Build @map, @pair from %upper (uppercase->lowercase mappings).
# Process in codepoint order, detecting consecutive ranges (maps, laces)
# and isolated pairs.  This replaces the old CaseFolding.txt second scan.
# @map now contains both 3-member maps and 2-member laces.
for my $cp (sort { $a <=> $b } keys %upper) {
    $lc = $upper{$cp};

    if ( !@map ) {
        add_map( \@map, $cp, $lc );    # 'A' -> 'a'
    }
    else {
        if ( $cp - $prev == 1 ) {              # consecutive codepoint
            if ( $lc - $cp == 1 ) {

                # check if we can convert the previous pair to a lace
                if (    @pair
                    and $pair[-1][0] == $cp - 1
                    and $pair[-1][1] == $lc - 1 )
                {
                    warn(
                        sprintf(
                            "convert old pair to lace %04X %04X\n",
                            $cp, $lc
                        )
                    ) if $verbose;
                    pop @pair;
                    add_lace( \@map, $cp - 1, $lc - 1 );
                    add_lace( \@map, $cp,     $lc );
                }
                else {
                    add_lace( \@map, $cp, $lc );
                }
            }
            else {
                # check if we can convert the previous pair to a map
                if (    @pair
                    and $pair[-1][0] == $cp - 1
                    and $pair[-1][1] == $lc - 1 )
                {
                    if ($verbose) {
                        warn
                          sprintf( "convert old pair to map %04X, %04X\n",
                            $cp, $lc );
                    }
                    pop @pair;
                    add_map( \@map, $cp - 1, $lc - 1 );
                    add_map( \@map, $cp,     $lc );
                }
                else {
                    add_map( \@map, $cp, $lc );
                }
            }
        }
        else {    # not consecutive, a hole
            if ( $lc - $cp == 1 ) {
                add_lace( \@map, $cp, $lc );
            }
            else {    # add a hole
                warn( sprintf( "add_pair %04X, %04X\n", $cp, $lc ) )
                  if $verbose;
                add_pair( \@pair, $cp, $lc );
            }
        }
    }
    $prev = $cp;
}

# Add extra pairs from %lower for towupper of characters whose
# Simple_Uppercase_Mapping is not the reverse of any %upper entry.
# These are asymmetric mappings (e.g., micro sign 00B5 -> 039C Mu capital,
# but Mu capital -> 03BC mu small, not back to micro sign).
if (!$table) {
    for my $lc_cp (sort { $a <=> $b } keys %lower) {
        my $uc_target = $lower{$lc_cp};
        # Skip if already covered: the uppercase target maps back to this char
        next if exists $upper{$uc_target} && $upper{$uc_target} == $lc_cp;
        # Add as a pair [uc_target, lc_cp] so towupper(lc_cp) returns uc_target
        warn(sprintf("extra pair %04X, %04X (from %%lower)\n", $uc_target, $lc_cp))
          if $verbose;
        push @pair, [ $uc_target, $lc_cp ];
    }
    # Re-sort pairs by upper codepoint (they must be sorted for the early-break optimization).
    # When two pairs share the same upper, put the authoritative towlower mapping
    # (from %upper) first so that the linear search finds it before the extra
    # towupper-only pair.
    @pair = sort {
        $a->[0] <=> $b->[0]
        || ( exists $upper{$a->[0]} && $upper{$a->[0]} == $a->[1] ? -1
           : exists $upper{$b->[0]} && $upper{$b->[0]} == $b->[1] ?  1
           : $a->[1] <=> $b->[1] )
    } @pair;
}

my $ucd_version = "$v.0.0";
my @h_args      = ( $Unicode::Towctrans::VERSION, $cmdline_args, $ucd_version );

if ( !-w $out ) {
    chmod 0644, $out;
}
open FH, ">:utf8", $out or croak "$PACKAGE: $out can't be written $!";
printf FH <<'EOF', @h_args;
/* ex: set ro ft=c: -*- buffer-read-only: t -*-
 *
 * !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 * This file is auto-generated by Unicode::Towctrans %s
 * gen_wctrans%s
 * for Unicode %s
 * Any changes here will be lost!
 */
EOF

if ( $musl or $safec ) {
    print FH "#define HAVE_LOCALE_TR\n";
}

print FH <<'EOF';
/*
Copyright (c) 2005-2014 Rich Felker, et al.
Copyright (c) 2018,2020,2026 Reini Urban

--------------------------------------------------------------
This code is licensed under the following standard MIT license
--------------------------------------------------------------

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
----------------------------------------------------------------------
*/

#include <assert.h>
#include <stdint.h>
#include <wctype.h>
#ifdef HAVE_LOCALE_TR
#include <locale.h>
#include <string.h>
#endif

EOF

if ($table) {
    # --table: generate musl-new style two-level base-6 tables
    # Uses %lower (lc->uc) and %upper (uc->lc) from UnicodeData.txt

    printf FH <<'EOF', ( $ucd_version, $v );

/* for Unicode %s */
#define TOWCTRANS_UNICODE_VERSION %u

EOF

    # Helper: construct a rule as a 32-bit signed integer string key.
    # rule = (delta << 8) | type, stored as a string to avoid Perl 64-bit issues.
    # Returns the string representation of the 32-bit signed int.
    sub rule_int32 {
        my ($delta, $type) = @_;
        # Compute as native int, then mask to 32 bits
        my $val = ($delta << 8) | $type;
        # Reinterpret as signed 32-bit
        $val = unpack('l', pack('l', $val));
        return $val;
    }

    # Format a rule int for C output
    sub fmt_rule {
        my ($r) = @_;
        if ($r == 0) { return "0x0"; }
        # Ensure signed 32-bit interpretation
        $r = unpack('l', pack('l', $r));
        if ($r >= 0) {
            return sprintf("0x%x", $r);
        } else {
            return sprintf("-0x%x", -$r);
        }
    }

    # Build a mapping from codepoint -> (delta, direction)
    # Uses the musl-old convention: lower=1 means towlower, lower=0 means towupper.
    # With the rt^dir formula:
    #   rule type 0: lowercase delta, applied when dir=1 (towlower)
    #   rule type 1: uppercase delta, applied when dir=0 (towupper)

    my $NBLOCKS = 512;   # covers 0x00000 - 0x1FFFF
    my $BLOCKSZ = 256;

    # Per-codepoint rule assignment.
    # Each rule is: (delta << 8) | type, as a 32-bit signed int.
    my %cp_rule;    # codepoint -> rule_int (32-bit signed)

    # %lower maps lowercase->uppercase, %upper maps uppercase->lowercase
    # (variable names below are historical and misleading)

    # Codepoints in %lower (lowercase chars): delta goes uppercase-ward.
    # type 1: applied when dir=0 (towupper)
    for my $uc (keys %lower) {
        my $lc = $lower{$uc};
        next if $lc == $uc;
        $cp_rule{$uc} = rule_int32($lc - $uc, 1);
    }

    # Codepoints in %upper (uppercase chars): delta goes lowercase-ward.
    # type 0: applied when dir=1 (towlower)
    for my $lc (keys %upper) {
        my $uc = $upper{$lc};
        next if $uc == $lc;
        $cp_rule{$lc} = rule_int32($uc - $lc, 0);
    }

    # Codepoints with BOTH a lowercase and an uppercase mapping
    # (e.g., titlecase characters like U+01C5).
    my %has_both;
    for my $cp (keys %cp_rule) {
        if (exists $lower{$cp} && $lower{$cp} != $cp &&
            exists $upper{$cp} && $upper{$cp} != $cp) {
            $has_both{$cp} = 1;
        }
    }

    # For each 256-codepoint block, collect distinct rules.
    # Max 6 per block (base-6 encoding). Overflow -> exceptions.
    my @block_rules;      # [$block] -> [rule_int, ...] (up to 6, may include 'EXC')
    my @block_rule_map;   # [$block]{$cp_offset} -> rule_index (0-5)
    my @block_exceptions; # [$block] -> [{cp_low, rule_int}, ...]

    for my $blk (0 .. $NBLOCKS - 1) {
        my $base = $blk * $BLOCKSZ;
        my %block_rule_counts;
        my %cp_to_rule;

        for my $offset (0 .. $BLOCKSZ - 1) {
            my $cp = $base + $offset;
            my $rule = $cp_rule{$cp} // 0;
            $cp_to_rule{$offset} = $rule;
            $block_rule_counts{$rule}++;
        }

        # Distinct rules sorted by frequency (most frequent first)
        my @distinct = sort { $block_rule_counts{$b} <=> $block_rule_counts{$a}
                              || $a <=> $b } keys %block_rule_counts;

        my @block_r;
        my @exception_rules;

        # Identity (0x0) must always be first if present
        if (exists $block_rule_counts{0}) {
            push @block_r, 0;
            @distinct = grep { $_ != 0 } @distinct;
        }

        for my $r (@distinct) {
            if (@block_r < 6) {
                push @block_r, $r;
            } else {
                push @exception_rules, $r;
            }
        }

        # Build exception list
        my @exc;
        my %exc_rules_set = map { $_ => 1 } @exception_rules;
        for my $offset (0 .. $BLOCKSZ - 1) {
            my $cp = $base + $offset;
            if ($has_both{$cp}) {
                # %lower maps lc->uc, %upper maps uc->lc (confusing names)
                my $uc_target_delta = $lower{$cp} - $cp;  # delta to uppercase
                my $lc_target_delta = $upper{$cp} - $cp;  # delta to lowercase
                if ($uc_target_delta == -1 && $lc_target_delta == 1) {
                    # Titlecase: type 3 (towlower: +1, towupper: -1)
                    push @exc, { cp_low => $offset, rule_int => 0x3 };
                } else {
                    push @exc, { cp_low => $offset, rule_int => $cp_rule{$cp} };
                }
                $cp_to_rule{$offset} = undef;
            } elsif ($exc_rules_set{$cp_to_rule{$offset} // 0}) {
                push @exc, { cp_low => $offset, rule_int => $cp_to_rule{$offset} };
                $cp_to_rule{$offset} = undef;
            }
        }

        # If we have exceptions, we need an EXC rule slot in block_r
        my $exc_rule_idx;
        if (@exc) {
            if (@block_r >= 6) {
                # Evict the least-frequent non-identity rule
                my $evicted = pop @block_r;
                for my $offset (0 .. $BLOCKSZ - 1) {
                    if (defined $cp_to_rule{$offset} && $cp_to_rule{$offset} == $evicted) {
                        push @exc, { cp_low => $offset, rule_int => $evicted };
                        $cp_to_rule{$offset} = undef;
                    }
                }
            }
            $exc_rule_idx = scalar @block_r;
            push @block_r, 'EXC';
        }

        # Build the rule_map for this block (cp_offset -> 0-5 index)
        my %rule_map;
        my %rule_pos;
        for my $ri (0 .. $#block_r) {
            $rule_pos{$block_r[$ri]} = $ri if $block_r[$ri] ne 'EXC';
        }

        for my $offset (0 .. $BLOCKSZ - 1) {
            if (!defined $cp_to_rule{$offset}) {
                # Exception codepoint
                $rule_map{$offset} = $exc_rule_idx // 0;
            } else {
                $rule_map{$offset} = $rule_pos{$cp_to_rule{$offset}} // 0;
            }
        }

        $block_rules[$blk]      = \@block_r;
        $block_rule_map[$blk]   = \%rule_map;
        $block_exceptions[$blk] = \@exc;
    }

    # Build the global rules[] array.
    # Strategy: rules[] is a single flat array containing:
    #   1. Deduplicated block rule sequences (contiguous per block)
    #   2. Exception-referenced rules (deduplicated globally)
    # rulebases[b] points to where block b's rules start in rules[].
    # Blocks with identical rule sequences share the same rulebase.
    #
    # Two passes:
    # Pass 1: Build block rule sequences (without EXC rules, which need
    #         exception indices not yet known). Deduplicate sequences.
    # Pass 2: Add exception-referenced rules, then fill in EXC rule entries.

    my @g_rules;       # the rules[] array emitted to C
    my @g_rulebases;   # one per block
    my @g_exceptions;  # [{cp_low, rule_idx}]

    # For each block, build its rule sequence key (for deduplication).
    # A block's rule sequence is its list of non-EXC rule values.
    # Blocks with the same non-EXC rules AND no exceptions can share a rulebase.
    # Blocks with exceptions get unique entries (because the EXC rule encodes
    # block-specific exception offsets).

    my %seq_to_base;   # rule-sequence-key -> index in @g_rules

    # Pass 1: Allocate rule sequences
    for my $blk (0 .. $NBLOCKS - 1) {
        my $block_r   = $block_rules[$blk];
        my $block_exc = $block_exceptions[$blk];
        my $has_exc   = grep { $_ eq 'EXC' } @$block_r;

        # Build the non-EXC rule sequence
        my @non_exc = map { $_ eq 'EXC' ? () : $_ } @$block_r;
        my $seq_key = join(',', @non_exc);

        if (!$has_exc && exists $seq_to_base{$seq_key}) {
            # Reuse existing rulebase
            $g_rulebases[$blk] = $seq_to_base{$seq_key};
        } else {
            $g_rulebases[$blk] = scalar @g_rules;
            $seq_to_base{$seq_key} = $g_rulebases[$blk] unless $has_exc;

            for my $ri (0 .. $#$block_r) {
                if ($block_r->[$ri] eq 'EXC') {
                    # Placeholder — will be filled in pass 2
                    push @g_rules, 'EXC_PLACEHOLDER';
                    # Remember where this placeholder is and which block
                    $block_rules[$blk][$ri] = { _exc_pos => $#g_rules, _blk => $blk };
                } else {
                    push @g_rules, $block_r->[$ri];
                }
            }
        }
    }

    # Pass 2: Add exception-referenced rules to @g_rules (deduplicated),
    # then fill in EXC placeholders.
    my %exc_rule_to_gidx;  # rule_int -> index in @g_rules

    for my $blk (0 .. $NBLOCKS - 1) {
        my $block_r   = $block_rules[$blk];
        my $block_exc = $block_exceptions[$blk];

        for my $ri (0 .. $#$block_r) {
            next unless ref $block_r->[$ri] && exists $block_r->[$ri]{_exc_pos};
            my $exc_pos = $block_r->[$ri]{_exc_pos};

            my $xb = scalar @g_exceptions;
            my $xn = scalar @$block_exc;

            # Sort exceptions by cp_low for binary search
            my @sorted_exc = sort { $a->{cp_low} <=> $b->{cp_low} } @$block_exc;

            for my $exc (@sorted_exc) {
                my $r = $exc->{rule_int};
                if (!exists $exc_rule_to_gidx{$r}) {
                    $exc_rule_to_gidx{$r} = scalar @g_rules;
                    push @g_rules, $r;
                }
                push @g_exceptions, {
                    cp_low   => $exc->{cp_low},
                    rule_idx => $exc_rule_to_gidx{$r}
                };
            }

            # Fill in the EXC placeholder: type 2, delta = (xb << 8 | xn)
            my $exc_rule = (($xb << 8) | $xn) << 8 | 2;
            $g_rules[$exc_pos] = $exc_rule;
        }
    }

    # Build tab[]: two-level base-6 table
    # Level 1: 512 bytes (one per block), value * 86 + x indexes into combined array
    # Level 2: 86-byte blocks appended after level 1
    # The level 1 values must be offset so that val*86 reaches past the 512-byte
    # level 1 region into the level 2 data.
    # Minimum usable index: ceil(512/86) = 6 (6*86 = 516 > 512)

    use POSIX qw(ceil);
    my $L2_OFFSET = ceil($NBLOCKS / 86);  # = 6

    my @tab_l2_blocks;  # array of 86-byte arrayrefs
    my %tab_l2_dedup;   # block-string -> index (0-based, before offset)
    my @tab_l1;         # 512 entries

    for my $blk (0 .. $NBLOCKS - 1) {
        my $rule_map = $block_rule_map[$blk];
        my @block86;

        for my $x (0 .. 85) {
            my @vals;
            for my $y (0 .. 2) {
                my $offset = $x * 3 + $y;
                if ($offset < $BLOCKSZ) {
                    push @vals, ($rule_map->{$offset} // 0);
                } else {
                    push @vals, 0;
                }
            }
            # Encode 3 values (0-5) into one byte using reversed order
            # for compatibility with the mt[] decode: v[2]*36 + v[1]*6 + v[0]
            my $byte = $vals[2] * 36 + $vals[1] * 6 + $vals[0];
            push @block86, $byte;
        }

        # Deduplication
        my $key = join(',', @block86);
        if (!exists $tab_l2_dedup{$key}) {
            $tab_l2_dedup{$key} = scalar @tab_l2_blocks;
            push @tab_l2_blocks, \@block86;
        }
        # Level 1 value = dedup index + L2_OFFSET
        $tab_l1[$blk] = $tab_l2_dedup{$key} + $L2_OFFSET;
    }

    # The combined tab[] array:
    # bytes 0..511: level 1 (block indices with offset)
    # bytes 512..(512 + nblocks*86 - 1): level 2 blocks
    # But level 2 starts at byte L2_OFFSET*86 = 516, so bytes 512..515 are padding.
    # We need to pad with zeros between level 1 end and level 2 start.
    my $l2_start_byte = $L2_OFFSET * 86;  # = 516
    my $pad_bytes = $l2_start_byte - $NBLOCKS;  # = 4

    # --- Emit tab[] ---
    print FH "static const unsigned char tab[] = {\n";

    # Level 1: 512 block indices
    for my $row (0 .. ($NBLOCKS / 16) - 1) {
        my @vals = @tab_l1[$row * 16 .. $row * 16 + 15];
        printf FH "\t%s,\n", join(', ', @vals);
    }

    # Padding bytes between level 1 and level 2
    if ($pad_bytes > 0) {
        printf FH "\t%s,\n", join(', ', (0) x $pad_bytes);
    }

    # Level 2: 86-byte blocks
    for my $bi (0 .. $#tab_l2_blocks) {
        my $block = $tab_l2_blocks[$bi];
        for my $row (0 .. int(85 / 16)) {
            my $end = $row * 16 + 15;
            $end = 85 if $end > 85;
            my @vals = @$block[$row * 16 .. $end];
            printf FH "\t%s,\n", join(', ', @vals);
        }
    }
    print FH "};\n";

    # --- Emit rules[] ---
    print FH "static const int rules[] = {\n";
    my @rule_strs = map { fmt_rule($_) } @g_rules;
    for my $row (0 .. int($#g_rules / 6)) {
        my $end = $row * 6 + 5;
        $end = $#g_rules if $end > $#g_rules;
        printf FH "\t%s,\n", join(', ', @rule_strs[$row * 6 .. $end]);
    }
    print FH "};\n";

    # --- Emit rulebases[] ---
    my $max_rb = 0;
    for my $rb (@g_rulebases) { $max_rb = $rb if $rb > $max_rb; }
    my $rb_type = $max_rb > 255 ? "unsigned short" : "unsigned char";
    printf FH "static const %s rulebases[] = {\n", $rb_type;
    for my $row (0 .. ($NBLOCKS / 16) - 1) {
        my @vals = @g_rulebases[$row * 16 .. $row * 16 + 15];
        printf FH "\t%s,\n", join(', ', @vals);
    }
    print FH "};\n";

    # --- Emit exceptions[][] ---
    my $max_exc_idx = 0;
    for my $e (@g_exceptions) {
        $max_exc_idx = $e->{rule_idx} if $e->{rule_idx} > $max_exc_idx;
    }
    my $exc_type = $max_exc_idx > 255 ? "unsigned short" : "unsigned char";
    printf FH "static const %s exceptions[][2] = {\n", $exc_type;
    for my $row (0 .. int($#g_exceptions / 4)) {
        my $end = $row * 4 + 3;
        $end = $#g_exceptions if $end > $#g_exceptions;
        my @entries;
        for my $i ($row * 4 .. $end) {
            push @entries, sprintf("{ %u, %u }", $g_exceptions[$i]{cp_low}, $g_exceptions[$i]{rule_idx});
        }
        printf FH "\t%s,\n", join(', ', @entries);
    }
    print FH "};\n";

    # --- casemap() function ---
    print FH "\n";
    printf FH "static int %s(unsigned c, int lower)\n", $fn;
    print FH <<'EOF';
{
	unsigned b, x, y, v, rt, xb, xn;
	int r, rd, c0 = c;

	if (c >= 0x20000) return c;

	b = c>>8;
	c &= 255;
	x = c/3;
	y = c%3;

	/* lookup entry in two-level base-6 table */
	v = tab[tab[b]*86+x];
	static const int mt[] = { 2048, 342, 57 };
	v = (v*mt[y]>>11)%6;

	/* use the bit vector out of the tables as an index into
	 * a block-specific set of rules and decode the rule into
	 * a type and a case-mapping delta. */
	r = rules[rulebases[b]+v];
	rt = r & 255;
	rd = r >> 8;

	/* rules 0/1 are simple lower/upper case with a delta.
	 * apply according to desired mapping direction.
	 * lower=1: towlower, lower=0: towupper. */
	if (rt < 2) return c0 + (rd & -(rt^lower));

	/* binary search. endpoints of the binary search for
	 * this block are stored in the rule delta field. */
	xn = rd & 0xff;
	xb = (unsigned)rd >> 8;
	while (xn) {
		unsigned try = exceptions[xb+xn/2][0];
		if (try == c) {
			r = rules[exceptions[xb+xn/2][1]];
			rt = r & 255;
			rd = r >> 8;
			if (rt < 2) return c0 + (rd & -(rt^lower));
			/* Hard-coded for the four exceptional titlecase */
			return c0 + (lower ? 1 : -1);
		} else if (try > c) {
			xn /= 2;
		} else {
			xb += xn/2;
			xn -= xn/2;
		}
	}
	return c0;
}

EOF

    # Wrapper functions
    if ($musl) {
        print FH <<"EOF";

wint_t towupper(wint_t wc)
{
	return (unsigned)wc < 128 ? toupper(wc) : (wint_t)$fn(wc, 1);
}

wint_t towlower(wint_t wc)
{
	return (unsigned)wc < 128 ? tolower(wc) : (wint_t)$fn(wc, 0);
}

wint_t __towupper_l(wint_t c, locale_t l)
{
	return towupper(c);
}

wint_t __towlower_l(wint_t c, locale_t l)
{
	return towlower(c);
}

weak_alias(__towupper_l, towupper_l);
weak_alias(__towlower_l, towlower_l);
EOF
    }
    elsif ($safec) {
        print FH <<"EOF";

EXPORT uint32_t _towupper(uint32_t wc) {
    return wc < 128 ? (uint32_t)toupper(wc) : (uint32_t)$fn(wc, 1);
}
#ifndef HAVE_TOWUPPER
EXPORT uint32_t towupper(uint32_t wc) {
    return wc < 128 ? (uint32_t)toupper(wc) : (uint32_t)$fn(wc, 1);
}
#endif

#ifndef HAVE_TOWLOWER
EXPORT uint32_t towlower(uint32_t wc) {
    return (unsigned)wc < 128 ? (wint_t)tolower(wc) : (wint_t)$fn(wc, 0);
}
#endif
EOF
    }

    close FH;
    chmod 0444, $out;
    exit 0;
}

if ($use_bitfields) {
    print FH <<'EOF';
/* map from upper until upper, to lower. sign 1 for negative. */
#define CASEMAP(u1, u2, l) {(u1), (l) < (u1) ? 1 : 0, \
        (l) < (u1) ? (u1) - (l) : (l) - (u1), (u2) - (u1) + 1}
/* long variant, without sign bit. */
#define CASEMAPL(u1, u2, l) {(u1), (l) - (u1), (u2) - (u1) + 1}
/* map from upper until lower, with dist 1 */
#define CASELACE(u1, u2) CASEMAP((u1), (u2), (u1) + 1)
#define CASELACEL(u1, u2) CASEMAPL((u1), (u2), (u1) + 1)
EOF
}
else {
    print FH <<'EOF';
/* map from upper until upper, to lower */
#define CASEMAP(u1, u2, l) {(u1), (l) - (u1), (u2) - (u1) + 1}
/* map from upper until lower, with dist 1 */
#define CASELACE(u1, u2) CASEMAP((u1), (u2), (u1) + 1)
EOF
}

printf FH <<'EOF', ( $ucd_version, $v );

/* for Unicode %s */
#define TOWCTRANS_UNICODE_VERSION %u

EOF

# iterate @map (3-member maps and 2-member laces interleaved), sorted by base
my @sorted_map = sort { $a->[0] <=> $b->[0] } @map;
my ( $case_min, $case_max, $casel_min, $casel_max );
$case_min = $sorted_map[0];
$case_max = $sorted_map[0];

my $has_long;
my $max_upper = ( 2**$bits[0] ) - 1;             # 16: 0xffff;
my $max_lower = ( ( 2**$bits[1] ) - 1 ) >> 1;    # 16: 32767, 8: 128;
my $min_lower = -( $max_lower - 1 );
my $max_len   = ( 2**$bits[2] ) - 1;             # 8: 255;

for my $entry (@sorted_map) {

    if ( @$entry == 3 ) {
        # MAP entry
        my $m = $entry;
        my $lower = map_lower($m);
        my $len   = map_len($m);

        # short or long
        my $is_long = (
                 $m->[0] > $max_upper
              or $lower > $max_lower
              or $lower < $min_lower
              or $len > $max_len
        );

        # on overflow push to @CASEL instead
        if ($is_long) {
            my $CASEMAP = $use_bitfields ? "CASEMAPL" : "CASEMAP";
            $casel_min = $m unless $casel_min;
            $casel_max = $m
              if !defined($casel_max)
              or $casel_max->[0] < $m->[0];
            my $cmt = sprintf " /* '%c'->'%c'..'%c' {, %d, %u} */",
              $m->[0], $m->[1], $m->[2], map_lower($m), map_len($m);
            my $s = sprintf "    $CASEMAP(0x%05x, 0x%05x, 0x%05x),%s\n",
              $m->[0], $m->[1], $m->[2], $cmt;

            #warn "defer overflow $s0 to casemapl\n" if $verbose;
            push @CASEL, $s;
            push @CASEL_RAW, [$m->[0], $lower, $len, $s];
        }
        else {
            my $CASEMAP = "CASEMAP";
            $case_max = $m if $case_max->[0] < $m->[0];
            my $cmt = sprintf " /* '%c'->'%c'..'%c' {, %d, %u} */",
              $m->[0], $m->[1], $m->[2], map_lower($m), map_len($m);
            my $s = sprintf "    $CASEMAP(0x%04x, 0x%04x, 0x%04x),%s\n",
              $m->[0],
              $m->[1], $m->[2], $cmt;
            push @CASE, $s;
            push @CASE_RAW, [$m->[0], $lower, $len, $s];
        }
    }
    elsif ( @$entry == 2 ) {
        # LACE entry
        my $l = $entry;
        $l->[1] = $l->[1] - 1;
        my $lower   = 1;    # lace_lower is always 1
        my $len     = $l->[1] - $l->[0] + 1;    # map_len of [$l->[0], $l->[1], $l->[0]+1]
        my $is_long = (
                 $l->[0] > $max_upper
              or $lower > $max_lower
              or $lower < $min_lower
              or $len > $max_len
        );
        if ($is_long) {
            my $CASELACE = $use_bitfields ? "CASELACEL" : "CASELACE";
            $casel_min = $l unless @CASEL;
            $casel_max = $l
              if !defined($casel_max)
              or $casel_max->[0] < $l->[0];
            my $spc = " " x 9;
            my $cmt = sprintf(
                "%s/* '%c'->'%c' {, %d, %u} */",
                $spc,   $l->[0], $l->[1] + 1,
                $lower, $len
            );
            push @CASEL,
              sprintf( "    $CASELACE(0x%05x, 0x%05x),%s\n",
                $l->[0], $l->[1], $cmt );
            push @CASEL_RAW, [$l->[0], $lower, $len, $CASEL[-1]];
        }
        else {
            $case_min = $l unless @CASE;
            $case_max = $l if $case_max->[0] < $l->[0];
            my $spc = " " x 8;
            my $cmt = sprintf( "%s/* '%c'->'%c' {, %d, %u} */",
                " " x 8, $l->[0], $l->[1], $lower, $len );
            push @CASE,
              sprintf( "    CASELACE(0x%04x, 0x%04x),%s\n",
                $l->[0], $l->[1], $cmt );
            push @CASE_RAW, [$l->[0], $lower, $len, $CASE[-1]];
        }
    }
}

print FH <<"EOF";
static const struct casemaps_s {
    $f_upper
EOF
print FH <<"EOF" if $use_bitfields;
    $f_sign
EOF
printf FH <<"EOF", scalar @CASE, $case_min->[0], $case_max->[0];
    $f_lower
    $f_len
} casemaps[%u] = {
    /* upper: 0x%x - 0x%x */
    /* from, until, to */
EOF
print FH $_ for @CASE;
printf FH <<"EOF", scalar @CASEL, $casel_min->[0], $casel_max->[0];
};
static const struct casemapsl_s {
    uint32_t upper; /* base */
    int lower;      /* distance from upper to lower. 1 with LACE */
    uint16_t len;   /* how many */
} casemapsl[%u] = {
    /* upper: 0x%x - 0x%x */
    /* from, until, to */
EOF
print FH $_ for @CASEL;
print FH "};\n";

my (
    $MAP_FIRST,    $MAPL_FIRST, $MAP_LAST,
    $MAP_LAST_LEN, $MAPL_LAST,  $MAPL_LAST_LEN,
    $TARGET_FIRST, $TARGETL_FIRST, $TARGET_LAST, $TARGETL_LAST
);
if ($bsearch || $bsearch_both) {
    $MAP_FIRST  = $case_min->[0];
    $MAPL_FIRST = $casel_min->[0];
    $MAP_LAST   = $case_max->[0];
    $MAP_LAST_LEN =
      @$case_max == 3 ? map_len($case_max) : $case_max->[1] - $case_max->[0];
    $MAPL_LAST = $casel_max->[0];
    $MAPL_LAST_LEN =
      @$casel_max == 3
      ? map_len($casel_max)
      : $casel_max->[1] - $casel_max->[0];
}
if ($bsearch_both) {
    $bsearch = 1;    # --bsearch-both implies --bsearch
    # Sort casemaps raw data by target (upper + lower) for upper binary search
    my @case_by_target = sort { ($a->[0] + $a->[1]) <=> ($b->[0] + $b->[1]) } @CASE_RAW;
    my @casel_by_target = sort { ($a->[0] + $a->[1]) <=> ($b->[0] + $b->[1]) } @CASEL_RAW;
    $TARGET_FIRST  = $case_by_target[0][0]  + $case_by_target[0][1];
    $TARGET_LAST   = $case_by_target[-1][0] + $case_by_target[-1][1] + $case_by_target[-1][2] - 1;
    $TARGETL_FIRST = $casel_by_target[0][0]  + $casel_by_target[0][1];
    $TARGETL_LAST  = $casel_by_target[-1][0] + $casel_by_target[-1][1] + $casel_by_target[-1][2] - 1;
}

# The DZ digraph pair [0x1F1, 0x1F3] already handles both directions:
# towlower(0x1F1) finds pairs[i][0]==0x1F1, returns pairs[i][1]=0x1F3
# towupper(0x1F3) finds pairs[i][1]==0x1F3, returns pairs[i][0]=0x1F1
# No extra reverse pair needed (same as 01C4/01CA digraphs).

# Check if any CASELACE (lower==1) entry in casemapsl covers the
# titlecase digraph codepoints (0x1C4-0x1F3).  With custom --bits the
# digraphs may land in either casemaps or casemapsl; only emit the
# lace exception in the loop(s) that actually contain them.
my $case_has_digraph_lace = 0;
for my $r (@CASE_RAW) {
    next unless $r->[1] == 1;    # only lace entries
    my $end = $r->[0] + $r->[2] - 1;
    if ( $r->[0] <= 0x1F3 && $end >= 0x1C4 ) {
        $case_has_digraph_lace = 1;
        last;
    }
}
my $casel_has_digraph_lace = 0;
for my $r (@CASEL_RAW) {
    next unless $r->[1] == 1;    # only lace entries
    my $end = $r->[0] + $r->[2] - 1;
    if ( $r->[0] <= 0x1F3 && $end >= 0x1C4 ) {
        $casel_has_digraph_lace = 1;
        last;
    }
}

@PAIR  = sort { $a->[0] <=> $b->[0] } @pair;
@PAIRL = grep { $_->[0] > 0xffff } @PAIR;
@PAIR  = grep { $_->[0] <= 0xffff } @PAIR;

printf FH "\nstatic const unsigned short pairs[%u][2] = {\n"
  . "    /* upper: 0x%x - 0x%x */\n"
  . "    /* upper, lower */\n", scalar @PAIR, $PAIR[0][0], $PAIR[-1][0];
for my $p (@PAIR) {
    my $cmt = sprintf( " /* '%c' -> '%c' */", $p->[0], $p->[1] );
    printf FH "    {0x%04x, 0x%04x},%s\n", $p->[0], $p->[1], $cmt;
}
print FH "};\n";
if (@PAIRL) {
    if ($bsearch) {
        print FH "#define HAVE_PAIRL\n";
        printf FH "#define PAIRL_SZ %u\n", scalar @PAIRL;
    }
    printf FH "static const unsigned int pairl[%u][2] = {\n"
      . "    /* upper: 0x%x - 0x%x */\n"
      . "    /* upper, lower */\n", scalar @PAIRL, $PAIRL[0][0], $PAIRL[-1][0];
    for my $p (@PAIRL) {
        my $cmt = sprintf( " /* '%c' -> '%c' */", $p->[0], $p->[1] );
        printf FH "    {0x%05x, 0x%05x},%s\n", $p->[0], $p->[1], $cmt;
    }
    print FH "};\n";
}
# Create reverse-sorted index tables for upper (target) lookups for --bsearch-both.
# Instead of duplicating the full tables, store only indices into the lower tables.
# Choose the smallest index type that fits the array size.
sub idx_type { $_[0] <= 256 ? "uint8_t" : "uint16_t" }

my (@PAIR_UPPER, @PAIRL_UPPER);
if ($bsearch_both) {
    # Build a lookup from [upper,lower] pair to index in @PAIR
    my %pair_idx;
    for my $pi (0 .. $#PAIR) {
        $pair_idx{$PAIR[$pi][0]}{$PAIR[$pi][1]} = $pi;
    }
    my %pairl_idx;
    for my $pi (0 .. $#PAIRL) {
        $pairl_idx{$PAIRL[$pi][0]}{$PAIRL[$pi][1]} = $pi;
    }

    @PAIR_UPPER  = sort { $a->[1] <=> $b->[1] } grep { $_->[0] <= 0xffff && $_->[1] <= 0xffff } @pair;
    @PAIRL_UPPER = sort { $a->[1] <=> $b->[1] } grep { $_->[0] > 0xffff || $_->[1] > 0xffff } @pair;

    my $pair_idx_t  = idx_type(scalar @PAIR);
    my $pairl_idx_t = idx_type(scalar @PAIRL);
    my $case_idx_t  = idx_type(scalar @CASE);
    my $casel_idx_t = idx_type(scalar @CASEL);

    if (@PAIR_UPPER) {
        printf FH "\n/* indices into pairs[], sorted by lower (target) */\n";
        printf FH "static const %s pairs_upper[%u] = {\n"
          . "    /* lower: 0x%x - 0x%x */\n", $pair_idx_t, scalar @PAIR_UPPER, $PAIR_UPPER[0][1], $PAIR_UPPER[-1][1];
        for my $p (@PAIR_UPPER) {
            my $idx = $pair_idx{$p->[0]}{$p->[1]};
            my $cmt = sprintf( " /* [%u] '%c' <- '%c' */", $idx, $p->[0], $p->[1] );
            printf FH "    %u,%s\n", $idx, $cmt;
        }
        print FH "};\n";
    }
    if (@PAIRL_UPPER) {
        printf FH "/* indices into pairl[], sorted by lower (target) */\n";
        printf FH "static const %s pairl_upper[%u] = {\n"
          . "    /* lower: 0x%x - 0x%x */\n", $pairl_idx_t, scalar @PAIRL_UPPER, $PAIRL_UPPER[0][1], $PAIRL_UPPER[-1][1];
        for my $p (@PAIRL_UPPER) {
            my $idx = $pairl_idx{$p->[0]}{$p->[1]};
            my $cmt = sprintf( " /* [%u] '%c' <- '%c' */", $idx, $p->[0], $p->[1] );
            printf FH "    %u,%s\n", $idx, $cmt;
        }
        print FH "};\n";
    }
    # Generate casemaps_upper[] as index array sorted by target for upper binary search
    # Assign original indices to @CASE_RAW entries
    my @case_indexed;
    for my $ci (0 .. $#CASE_RAW) {
        push @case_indexed, [$ci, @{$CASE_RAW[$ci]}];
        # [orig_idx, upper, lower, len, str]
    }
    my @case_by_target = sort { ($a->[1] + $a->[2]) <=> ($b->[1] + $b->[2]) } @case_indexed;
    printf FH "/* indices into casemaps[], sorted by target (upper+lower) */\n";
    printf FH "static const %s casemaps_upper[%u] = {\n"
      . "    /* target: 0x%x - 0x%x */\n",
      $case_idx_t,
      scalar @case_by_target,
      $case_by_target[0][1] + $case_by_target[0][2],
      $case_by_target[-1][1] + $case_by_target[-1][2] + $case_by_target[-1][3] - 1;
    for my $r (@case_by_target) {
        my $target = $r->[1] + $r->[2];
        printf FH "    %u, /* [%u] target 0x%x */\n", $r->[0], $r->[0], $target;
    }
    print FH "};\n";

    my @casel_indexed;
    for my $ci (0 .. $#CASEL_RAW) {
        push @casel_indexed, [$ci, @{$CASEL_RAW[$ci]}];
    }
    my @casel_by_target = sort { ($a->[1] + $a->[2]) <=> ($b->[1] + $b->[2]) } @casel_indexed;
    printf FH "/* indices into casemapsl[], sorted by target (upper+lower) */\n";
    printf FH "static const %s casemapsl_upper[%u] = {\n"
      . "    /* target: 0x%x - 0x%x */\n",
      $casel_idx_t,
      scalar @casel_by_target,
      $casel_by_target[0][1] + $casel_by_target[0][2],
      $casel_by_target[-1][1] + $casel_by_target[-1][2] + $casel_by_target[-1][3] - 1;
    for my $r (@casel_by_target) {
        my $target = $r->[1] + $r->[2];
        printf FH "    %u, /* [%u] target 0x%x */\n", $r->[0], $r->[0], $target;
    }
    print FH "};\n";
}
print FH "\n";

printf FH "uint32_t %s(uint32_t wc, int lower) {
", $fn;
if ($bsearch) {
    print FH <<'EOF';
    int i;
    int lo, hi;
EOF
}
else {
    print FH <<'EOF';
    int i;
    int lmul;  /* 1 for lower, -1 for upper */
    int lmask; /* 0 for lower, -1/0xffff for upper */

EOF
}
if ($with_iswalpha) {    # if we have a working iswalpha (not with glibc)
    print FH
"    /* !iswalpha(wc) only works with musl. */\n";
    print FH "    if (!iswalpha(wc)\n";
}
else {
    # print larger exclusion ranges. iswalpha is useless with glibc
    print FH "    if (";
}

# generate the excl ranges here, because it is more stable
my %alpha;
$alpha{$_}++ for keys %lower;
$alpha{$_}++ for keys %upper;
my @sorted = sort { $a <=> $b } keys %alpha;
my $first  = $sorted[0];
my $last   = $sorted[-1];
@excl = ( [ 1, $first - 1 ] );

for ( $first + 0 .. $last - 1 ) {
    if ( !exists $alpha{$_} ) {
        add_excl( \@excl, $_ );
    }
}

# ternary_tree(1,6) => [4, [2, 1, 3], [6, 5]]
#sub ternary_tree {
#    my ($lo, $hi) = @_;
#    return undef if $lo > $hi;
#    my $mid = int(($lo + $hi + 1) / 2);
#    my @node = ($mid, ternary_tree($lo, $mid - 1), ternary_tree($mid + 1, $hi));
#    pop @node while @node && !defined $node[-1];
#    return @node == 1 ? $node[0] : \@node;
#}
#sub binary_search_indices {
#    my ($n) = @_;
#    return () if $n <= 1;
#
#    my @indices;
#    my @queue = ([1, $n - 1]);
#
#    while (@queue) {
#        my ($lo, $hi) = @{ shift @queue };
#        my $mid = int(($lo + $hi) / 2);
#        push @indices, $mid;
#
#        push @queue, [$lo, $mid - 1] if $lo < $mid;
#        push @queue, [$mid + 1, $hi] if $mid < $hi;
#    }
#
#    return @indices;
#}

# first purge all too small ranges
my @new_excl = ( $excl[0] );
for my $i ( 1 .. $#excl ) {
    my $e = $excl[$i];
    my $skip;

    if ( $e->[1] - $e->[0] >= $MIN_ECXL ) {

        # cross-check
        for ( $e->[0] .. $e->[1] ) {
            if ( exists $lower{$_} or exists $upper{$_} ) {
                warn( sprintf( "wrong excl %04X skipped", $_ ) );
                $skip = 1;
                last;
            }
        }
        push @new_excl, $e unless $skip;
    }
}

# sort ranges by size. GH #2
printf FH "wc <= 0x%x                           /* %u */\n", $excl[0]->[1],
  $excl[0]->[1] unless $with_iswalpha;
# upper bound: everything above the last cased codepoint
printf FH "        || wc > 0x%x                      /* >%u */\n",
  $last, $last;
shift @new_excl;
my @ex = sort { $b->[1] - $b->[0] <=> $a->[1] - $a->[0] } @new_excl;
for my $e (@ex) {
    my $s = sprintf( "        || wc - 0x%x <= 0x%x - 0x%x",
        $e->[0], $e->[1], $e->[0] );
    my $spc = 45 - length($s) > 0 ? " " x ( 45 - length($s) ) : " ";
    printf FH "%s%s/* %u */\n", $s, $spc, $e->[1] - $e->[0] + 1;
}
print FH "    )\n";
print FH "        return wc;\n";
print FH <<'EOF';

#ifdef HAVE_LOCALE_TR
    /* check for the 2 turkish mappings if we have a turkish locale. */
    if ((lower && (wc == 0x49 || wc == 0x130)) ||
        (!lower && (wc == 0x69 || wc == 0x131))) {
        const char *loc = setlocale(LC_CTYPE, NULL);
        if (loc && (!strncmp(loc, "tr", 2) || !strncmp(loc, "az", 2))) {
            if (lower) {
                if (wc == 0x49)
                    return 0x131;
                else
                    return 0x69;
            } else {
                if (wc == 0x69)
                    return 0x130;
                else
                    return 0x49;
            }
        }
    }
#endif

EOF

my $SZ_CASE  = scalar(@CASE);
my $SZ_CASEL = scalar(@CASEL);
my $SZ_PAIR  = scalar(@PAIR);
my $SZ_PAIRL = scalar(@PAIRL);
my $SZ_PAIR_UPPER  = scalar(@PAIR_UPPER);
my $SZ_PAIRL_UPPER = scalar(@PAIRL_UPPER);

if ($bsearch) {

#my $low_decl = $use_bitfields ? "        const int low = cm->sign ? -(cm->lower) : cm->lower;" : "";
#my $CASELOW = $use_bitfields ? "low" : "cm->lower";
    my $bits_ret =
      $use_bitfields
      ? "cm->sign ? wc - cm->lower : wc + cm->lower;\n"
        : "wc + cm->lower";
    # fast search lower pairl
    my $pairl_search = "";
    if ( $SZ_PAIRL == 1 ) {
        $pairl_search = <<'EOF';
        if (pairl[0][1 - lower] == wc)
            return pairl[0][lower];
EOF
    }
    elsif ( $SZ_PAIRL > 1 ) {
        $pairl_search = <<"EOF";
        for (i = 0; i < $SZ_PAIRL; i++) {
            assert(i > 0 ? pairl[i][0] >= pairl[i - 1][0] : 1);
            if (pairl[i][0] == wc)
                return pairl[i][1];
            if (pairl[i][0] > wc)
                break;
        }
EOF
    }
    print FH <<"EOF";
    /* binary search the casemaps ranges. */
    if (lower) {
        /* binary search lower only */
        if (wc <= $MAP_LAST + $MAP_LAST_LEN) {
            lo = 0;
            hi = $SZ_CASE - 1;
            while (lo <= hi) {
                /* avoids overflow vs. (lo+hi)/2 */
                const int mid = lo + (hi - lo) / 2;
                const struct casemaps_s *cm = &casemaps[mid];
                if (wc < cm->upper) // too low
                    hi = mid - 1;
                else if (wc - cm->upper < cm->len) { // in range
                    if (cm->lower == 1) {            // is LACE
EOF
        if ($case_has_digraph_lace) {
            print FH <<'EOF';
                        /* Need this exception (wrong lace for titlecase digraphs).
                           Tested from Unicode 4 to 18. */
                        if (wc == 0x1C4 || wc == 0x1CA || wc == 0x1F1)
                            return wc + 2;
EOF
        }
        print FH <<"EOF";
                        return wc + 1 - ((wc - cm->upper) & 1);
                    } else
                        return $bits_ret;
                } else // too high
                    lo = mid + 1;
            }
        }
        if (wc - $MAPL_FIRST <= ($MAPL_LAST + $MAPL_LAST_LEN) - $MAPL_FIRST) {
            lo = 0;
            hi = $SZ_CASEL - 1;
            while (lo <= hi) {
                const int mid = lo + (hi - lo) / 2;
                const struct casemapsl_s *cm = &casemapsl[mid];
                if (wc < cm->upper) // lower
                    hi = mid - 1;
                else if (wc - cm->upper < cm->len) { // in range
                    if (cm->lower == 1) {
EOF
        if ($casel_has_digraph_lace) {
            print FH <<'EOF';
                        if (wc == 0x1C4 || wc == 0x1CA || wc == 0x1F1)
                            return wc + 2;
EOF
        }
        print FH <<"EOF";
                        return wc + 1 - ((wc - cm->upper) & 1);
                    } else
                        return wc + cm->lower;
                } else // higher
                    lo = mid + 1;
            }
        }
        /* binary search pairs lower */
        lo = 0;
        hi = $SZ_PAIR - 1;
        while (lo <= hi) {
            const int mid = lo + (hi - lo) / 2;
            const unsigned short *p = (unsigned short *)&pairs[mid];
            if (*p == wc) {
                /* With duplicate [0] keys (extra towupper pairs), ensure we
                   return the first match which is the authoritative towlower. */
                int first = mid;
                while (first > 0 && pairs[first - 1][0] == wc)
                    first--;
                return pairs[first][1];
            }
            else if (*p < wc)
                lo = mid + 1;
            else
                hi = mid - 1;
        }
$pairl_search
    } else {
EOF
    if ($bsearch_both) {
        # binary search upper via index into lower tables
        my $pairl_upper_search = "";
        if ($SZ_PAIRL_UPPER == 1) {
            $pairl_upper_search = <<'EOF';
        {
            const unsigned int *p = (unsigned int *)&pairl[pairl_upper[0]];
            if (p[1] == wc)
                return p[0];
        }
EOF
        }
        elsif ($SZ_PAIRL_UPPER > 1) {
            $pairl_upper_search = <<"EOF";
        lo = 0;
        hi = $SZ_PAIRL_UPPER - 1;
        while (lo <= hi) {
            const int mid = lo + (hi - lo) / 2;
            const unsigned int *p = (unsigned int *)&pairl[pairl_upper[mid]];
            if (p[1] == wc)
                return p[0];
            else if (p[1] < wc)
                lo = mid + 1;
            else
                hi = mid - 1;
        }
EOF
        }
        my $pair_upper_search = "";
        if ($SZ_PAIR_UPPER == 1) {
            $pair_upper_search = <<'EOF';
        {
            const unsigned short *p = (unsigned short *)&pairs[pairs_upper[0]];
            if (p[1] == wc)
                return p[0];
        }
EOF
        }
        elsif ($SZ_PAIR_UPPER > 1) {
            $pair_upper_search = <<"EOF";
        /* binary search upper pairs via index */
        lo = 0;
        hi = $SZ_PAIR_UPPER - 1;
        while (lo <= hi) {
            const int mid = lo + (hi - lo) / 2;
            const unsigned short *p = (unsigned short *)&pairs[pairs_upper[mid]];
            if (p[1] == wc) {
                /* With duplicate [1] keys (e.g. titlecase digraphs), find the
                   entry with the smallest [0] (true uppercase, not titlecase). */
                unsigned short best = p[0];
                int j = mid - 1;
                while (j >= 0) {
                    const unsigned short *q = (unsigned short *)&pairs[pairs_upper[j]];
                    if (q[1] != wc) break;
                    if (q[0] < best) best = q[0];
                    j--;
                }
                j = mid + 1;
                while (j < $SZ_PAIR_UPPER) {
                    const unsigned short *q = (unsigned short *)&pairs[pairs_upper[j]];
                    if (q[1] != wc) break;
                    if (q[0] < best) best = q[0];
                    j++;
                }
                return best;
            }
            else if (p[1] < wc)
                lo = mid + 1;
            else
                hi = mid - 1;
        }
EOF
        }
        print FH <<"EOF";
        /* binary search upper via index: casemaps_upper */
        if (wc >= $TARGET_FIRST && wc <= $TARGET_LAST) {
            lo = 0;
            hi = $SZ_CASE - 1;
            while (lo <= hi) {
                const int mid = lo + (hi - lo) / 2;
                const struct casemaps_s *cm = &casemaps[casemaps_upper[mid]];
                uint32_t target = cm->upper + cm->lower;
                uint32_t target_end = target + cm->len - 1;
                if (wc < target)
                    hi = mid - 1;
                else if (wc > target_end)
                    lo = mid + 1;
                else {
                    if (cm->lower == 1) {
EOF
        if ($case_has_digraph_lace) {
            print FH <<'EOF';
                        /* Need this exception (wrong lace for titlecase digraphs).
                           Tested from Unicode 4 to 18. We search pairs later. */
                        if (wc == 0x1F3 || wc == 0x1CC || wc == 0x1C6)
                            return wc - 2;
EOF
        }
        print FH <<"EOF";
                        return wc - ((wc - cm->upper) & 1);
                    } else {
                        return wc - cm->lower;
                    }
                }
            }
        }
        /* binary search upper via index: casemapsl_upper */
        if (wc >= $TARGETL_FIRST && wc <= $TARGETL_LAST) {
            lo = 0;
            hi = $SZ_CASEL - 1;
            while (lo <= hi) {
                const int mid = lo + (hi - lo) / 2;
                const struct casemapsl_s *cm = &casemapsl[casemapsl_upper[mid]];
                uint32_t target = cm->upper + cm->lower;
                uint32_t target_end = target + cm->len - 1;
                if (wc < target)
                    hi = mid - 1;
                else if (wc > target_end)
                    lo = mid + 1;
                else {
                    if (cm->lower == 1) {
EOF
        if ($casel_has_digraph_lace) {
            print FH <<'EOF';
                        if (wc == 0x1F3 || wc == 0x1CC || wc == 0x1C6)
                            return wc - 2;
EOF
        }
        print FH <<"EOF";
                        return wc - ((wc - cm->upper) & 1);
                    } else
                        return wc - cm->lower;
                }
            }
        }
$pair_upper_search
$pairl_upper_search
EOF
    }
    else {
        # --bsearch only: linear search upper
        my $low_decl =
          $use_bitfields
          ? "        const int low = casemaps[i].sign ? -(casemaps[i].lower) : casemaps[i].lower;"
          : "";
        my $CASELOW      = $use_bitfields ? "low" : "casemaps[i].lower";
        $pairl_search = "";
        if ( $SZ_PAIRL == 1 ) {
            $pairl_search = <<'EOF';
        if (pairl[0][1 - lower] == wc)
            return pairl[0][lower];
EOF
        }
        elsif ( $SZ_PAIRL > 1 ) {
            # upper only
            $pairl_search = <<"EOF";
        for (i = 0; i < $SZ_PAIRL; i++) {
            assert(i > 0 ? pairl[i][0] >= pairl[i - 1][0] : 1);
            if (pairl[i][1] == wc)
                return pairl[i][0];
        }
EOF
        }
        print FH <<"EOF";
        /* linear search upper */
        for (i = 0; i < $SZ_CASE; i++) {
$low_decl
            int base = casemaps[i].upper + $CASELOW;
            assert(i > 0 ? casemaps[i].upper >= casemaps[i - 1].upper : 1);
            if (wc - base < casemaps[i].len) {
                if (casemaps[i].lower == 1) {
EOF
        if ($case_has_digraph_lace) {
            print FH <<'EOF';
                    /* Need this exception (wrong lace for titlecase digraphs).
                       Tested from Unicode 4 to 18. We search pairs later. */
                    if (wc == 0x1F3 || wc == 0x1CC || wc == 0x1C6)
                        return wc - 2;
EOF
        }
        print FH <<"EOF";
                    return wc - ((wc - casemaps[i].upper) & 1);
                } else {
                    return wc - $CASELOW;
                }
            }
        }
        for (i = 0; i < $SZ_CASEL; i++) {
            unsigned long base = casemapsl[i].upper + casemapsl[i].lower;
            assert(i > 0 ? casemapsl[i].upper >= casemapsl[i - 1].upper : 1);
            if (wc - base < casemapsl[i].len) {
                if (casemapsl[i].lower == 1) {
EOF
        if ($casel_has_digraph_lace) {
            print FH <<'EOF';
                    /* Need this exception (wrong lace for titlecase digraphs).
                       Tested from Unicode 4 to 18. */
                    if (wc == 0x1F3 || wc == 0x1CC || wc == 0x1C6)
                        return wc - 2;
EOF
        }
        print FH <<"EOF";
                    return wc - ((wc - casemapsl[i].upper) & 1);
                } else {
                    return wc - casemapsl[i].lower;
                }
            }
        }
        /* upper: full linear search */
        for (i = 0; i < $SZ_PAIR; i++) {
            assert(i > 0 ? pairs[i][0] >= pairs[i - 1][0] : 1);
            if (pairs[i][1] == wc)
                return pairs[i][0];
        }
$pairl_search
EOF
    }
    print FH <<'EOF';
    }
    return wc;
}

#undef CASEMAP
#undef CASELACE
EOF
}
else {    # no bsearch
    my $low_decl =
      $use_bitfields
      ? "        const int low = casemaps[i].sign ? -(casemaps[i].lower) : casemaps[i].lower;"
      : "";
    my $CASELOW = $use_bitfields ? "low" : "casemaps[i].lower";
    print FH <<"EOF";
    lmul = 2 * lower - 1; /* 1 for lower, -1 for upper */
    lmask = lower - 1;    /* 0 for lower, -1/0xffff for upper */
    /* linear search both */
    for (i = 0; i < $SZ_CASE; i++) {
$low_decl
        int base = casemaps[i].upper + (lmask & $CASELOW);
        assert(i > 0 ? casemaps[i].upper >= casemaps[i - 1].upper : 1);
        if (wc - base < casemaps[i].len) {
            if (casemaps[i].lower == 1) {
EOF

    if ($case_has_digraph_lace) {
        print FH <<"EOF";
                /* Need this exception (wrong lace for titlecase digraphs).
                   Tested from Unicode 4 to 18. We search pairs later. */
                if (!lower && (wc == 0x1F3 || wc == 0x1CC || wc == 0x1C6))
                    return wc - 2;
                if (lower && (wc == 0x1F1 || wc == 0x1CA || wc == 0x1C4))
                    return wc + 2;
EOF
    }
    print FH <<"EOF";
                return wc + lower - ((wc - casemaps[i].upper) & 1);
            } else {
                return wc + lmul * $CASELOW;
            }
        }
        if (lower && casemaps[i].upper > wc)
            break;
    }
    for (i = 0; i < $SZ_CASEL; i++) {
        unsigned long base = casemapsl[i].upper + (lmask & casemapsl[i].lower);
        assert(i > 0 ? casemapsl[i].upper >= casemapsl[i - 1].upper : 1);
        if (wc - base < casemapsl[i].len) {
            if (casemapsl[i].lower == 1) {
EOF

    if ($casel_has_digraph_lace) {
        print FH <<"EOF";
                if (!lower && (wc == 0x1F3 || wc == 0x1CC || wc == 0x1C6))
                    return wc - 2;
                if (lower && (wc == 0x1F1 || wc == 0x1CA || wc == 0x1C4))
                    return wc + 2;
EOF
    }
    print FH <<"EOF";
                return wc + lower - ((wc - casemapsl[i].upper) & 1);
            }
            return wc + lmul * casemapsl[i].lower;
        }
        if (lower && casemapsl[i].upper > wc)
            break;
    }
    for (i = 0; i < $SZ_PAIR; i++) {
        assert(i > 0 ? pairs[i][0] >= pairs[i - 1][0] : 1);
        if (pairs[i][1 - lower] == wc)
            return pairs[i][lower];
        if (lower && pairs[i][0] > wc)
            break;
    }
EOF
    if ( $SZ_PAIRL == 1 ) {
        print FH <<'EOF';
    if (pairl[0][1 - lower] == wc)
        return pairl[0][lower];
EOF
    }
    elsif ( $SZ_PAIRL > 1 ) {
        print FH <<"EOF";
    for (i = 0; i < $SZ_PAIRL; i++) {
        assert(i > 0 ? pairl[i][0] >= pairl[i - 1][0] : 1);
        if (pairl[i][1 - lower] == wc)
            return pairl[i][lower];
        if (lower && pairl[i][0] > wc)
            break;
    }
EOF
    }

    print FH <<'EOF';
    return wc;
}

#undef CASEMAP
#undef CASELACE
EOF
}    # !$bsearch

if ($use_bitfields) {
    print FH <<'EOF';
#undef CASEMAPL
#undef CASELACEL
EOF
}

if ($musl) {

    print FH <<"EOF";

wint_t towupper(wint_t wc)
{
	return (unsigned)wc < 128 ? toupper(wc) : $fn(wc, 0);
}

wint_t towlower(wint_t wc)
{
	return (unsigned)wc < 128 ? tolower(wc) : $fn(wc, 1);
}

wint_t __towupper_l(wint_t c, locale_t l)
{
	return towupper(c);
}

wint_t __towlower_l(wint_t c, locale_t l)
{
	return towlower(c);
}

weak_alias(__towupper_l, towupper_l);
weak_alias(__towlower_l, towlower_l);
EOF
}
elsif ($safec) {

    print FH <<"EOF";

EXPORT uint32_t _towupper(uint32_t wc) {
    return wc < 128 ? (uint32_t)toupper(wc) : $fn(wc, 0);
}
#ifndef HAVE_TOWUPPER
EXPORT uint32_t towupper(uint32_t wc) {
    return wc < 128 ? (uint32_t)toupper(wc) : $fn(wc, 0);
}
#endif

#ifndef HAVE_TOWLOWER
EXPORT uint32_t towlower(uint32_t wc) {
    return (unsigned)wc < 128 ? (wint_t)tolower(wc) : $fn(wc, 1);
}
#endif
EOF
}

close FH;
chmod 0444, $out;

__END__

=head1 NAME

Unicode::Towctrans - Generate small case mapping tables

=head1 SYNOPSIS

    gen_wctrans
    gen_wctrans --safec
    gen_wctrans --musl
    gen_wctrans -v 15
    gen_wctrans -v 15 --ud UnicodeData.txt.15 --out towctrans-15.h
    gen_wctrans --lower16
    gen_wctrans --fn __towcase
    gen_wctrans --min-excl 10000
    gen_wctrans --bits 18:14:10
    gen_wctrans --lower16
    gen_wctrans --bsearch
    gen_wctrans --bsearch-both
    gen_wctrans --table

=head1 DESCRIPTION

F<gen_wctrans> generates a F<towctrans.h> header file, which is used by C<musl>
and C<safeclib> to generate small and efficient case mapping tables, to
build the libc C<towupper()> and C<towlower()> functions and its secure
variants C<towupper_s()> and C<towlower_s()>.

If the code may run on a system with the turkish or azeri locale, you
need to define C<-DHAVE_LOCALE_TR> to check for the special turkish i
locale and mappings at run-time.

If you know that your C<iswalpha()> works correctly (only with musl),
then use C<--with_iswalpha> to get a lightly faster function. E.g. for
benchmarking.

With C<--lower16> it creates larger and more C<casemaps> tables, with less
long C<casemapl> tables. Thus it finds those ranges earlier, at the cost of
more caches misses. Currently C<--lower16> is the best performance and size
combination. For C<--bits> the fastest are 18:14:10 and 12:12:8, the smallest
is the default 16:8:8.

With C<--bsearch> the tolower check is done with a binary search, the
toupper check does a linear search without early exit. It needs more
space, and its performance is not that good as with C<--lower16>.

With C<--bsearch-both> the speed is faster and the size is even
bigger, as we have to store the order of the upper maps and pairs also
to be able to binary search it.

With C<--table>, the musl-new style, the size is much bigger, as we
have to store mappings for all blocks. The lookup is much faster
though.

Planned also for the multi-byte folding tables for C<wcsfc_s()> for
safeclib. As the single-byte C<towupper> and C<towlower> conversions
are meaningless for many multi-byte unicode mappings, those with
status B<F> - full folding. Use a full string foldcasing function instead,
as safeclib C<wcsfc_s>, ICU C<u_strToUpper> or libunistring C<uc_toupper>.

=head1 PERFORMANCE

Currently it is still a bit un-optimized, but small and fast enough
compared to the other implementations. And esp. correct compared to glibc,
which ignores characters from other locales.

The bench uses Unicode 10.0 data (C<-v 10>) so that our tables match
the Unicode version compiled into musl-old.  Benchmark errors fall
into three categories, none of which are bugs in our code:

=over 4

=item Circled letters 0x24B6-0x24E9 (affects musl-old, 52 diffs)

Our code correctly maps these per UnicodeData.txt (e.g.
C<towupper(0x24D0)=0x24B6>).  musl-old does not map them at all.

=item Georgian Mtavruli 0x1C90-0x1CBF (affects musl-new, 96 diffs)

These uppercase Georgian letters were added in Unicode 11.0.  musl-new
includes them, but our Unicode 10.0 bench tables do not, so musl-new
reports differences for every Mtavruli codepoint.

=item Post-Unicode-10.0 additions (affects musl-new, 16+ diffs)

Additional cased characters introduced after Unicode 10.0 (Osage,
Adlam, etc.) are present in musl-new but absent from our Unicode 10.0
tables.

=item glibc errors

glibc errors are caused by glibc ignoring cased characters from
non-latin locales entirely.

=back

    make -C examples
    ./bench
                my:        552 [us]  100.00 %
           my_excl:        595 [us]   92.77 %
          my_low16:        594 [us]   92.93 %
           my_bits:        571 [us]   96.67 %
        my_bsearch:        477 [us]  115.72 %
       my_bsearchb:        556 [us]   99.28 %
          my_table:        257 [us]  214.79 %
          musl-new:        209 [us]  264.11 %	9 errors
          musl-old:       1406 [us]   39.26 %	3 errors
             glibc:        149 [us]  370.47 %	15 errors

    wc -c towctrans-*.o
      3528 towctrans-my.o
      3608 towctrans-myexcl.o
      3632 towctrans-mylow16.o
      3920 towctrans-mybits.o
      3968 towctrans-mybsearch.o
      4864 towctrans-mybsearch-both.o
      6816 towctrans-mytable.o
      6848 towctrans-musl-new.o
      3464 towctrans-musl-old.o
     97440 towctrans-glibc.o

Results with more various C<--bits> size combinations. They need just some
logical fixups for the 5 errors.

C<--bits 16:10:8> and C<--bits 12:12:8> being the most promising,
the best being twice as fast as the default.

     ./bench-bits.sh
          16:8:8:        251 [us] 100.0 % 67 21 142 0 6
         16:16:8:        125 [us] 200.8 % 76 12 142 0 6
         16:10:8:        119 [us] 210.9 % 67 21 142 0 6
        18:14:10:        118 [us] 212.7 % 85 3 142 0 6      5 errors
         18:14:8:        138 [us] 181.9 % 85 3 142 0 6      5 errors
        18:12:10:        120 [us] 209.2 % 81 7 142 0 6      5 errors
         18:12:8:        180 [us] 139.4 % 81 7 142 0 6      5 errors
         16:12:6:        193 [us] 130.1 % 67 21 142 0 6     5 errors
         16:10:6:        133 [us] 188.7 % 67 21 142 0 6     5 errors
         14:10:8:        127 [us] 197.6 % 58 30 142 0 6     5 errors
         14:12:6:        135 [us] 185.9 % 54 34 142 0 6     5 errors
         12:12:8:        119 [us] 210.9 % 34 54 142 0 6     5 errors

     4880 towctrans-bmy.o (16:8:8)
     5024 towctrans-bmylow16.o (16:16:8)
     5232 towctrans-bmybits.o (16:10:8)
     5408 bits-12_12_8.o
     5312 bits-14_12_6.o
     5312 bits-14_10_8.o
     5256 bits-16_10_6.o
     5256 bits-16_12_6.o
     5176 bits-18_12_8.o
     5208 bits-18_12_10.o
     5208 bits-18_14_8.o
     5240 bits-18_14_10.o

=head1 INSTALLATION

Perl 5.12 or later is required.

This module does not need to be installed. Running gen_wctrans is enough.
However for full testing and global installation run this:

   perl Makefile.PL
   make
   make test
   make test-all
   sudo make install

=head1 DEPENDENCIES

This module requires a UnicodeData.txt file from Unicode Character
Database, which is automatically downloaded if missing.

=head1 AUTHOR

Reini Urban <rurban@cpan.org>

Copyright(C) 2026 Reini Urban. All rights reserved

=head1 COPYRIGHT AND LICENSE

This module is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.

The generated files are MIT licensed. See the generated files headers.

=head1 SEE ALSO

=over 4

=item L<https://www.unicode.org/reports/tr44/#Casemapping>

=item L<https://git.musl-libc.org/cgit/musl/tree/src/ctype/towctrans.c>

=item L<https://git.musl-libc.org/cgit/musl/tree/src/ctype/towctrans.c?id=e8aba58ab19a18f83d7f78e80d5e4f51e7e4e8a9>

=item L<https://github.com/rurban/safeclib/blob/master/src/extwchar/towctrans.c>

=item L<https://sourceware.org/git/?p=glibc.git;a=tree;f=wctype;;hb=HEAD>

=back

=cut

# Local Variables:
# perl-indent-level: 4
# End:
