karakul/NastaliqKerning.py

"""NastaliqKerning plugin

This plugin provides two FEZ verbs - `NastaliqKerning` and
`AtHeight`. These are related in that they compute the "height"
of a glyph sequence and chain to a different routine based on
that height. In the case of `NastaliqKerning`, we evaluate all
sequences beginning with an initial glyph, compute the height
of that sequence, and also create a kerning table for that height.
In `AtHeight`, we evaluate all sequences, and if they fall within
a specified height range, dispatch to an arbitrary routine. (This
is used for the dot avoidance code.)

The syntax of each verb is:

    NastaliqKerning <units> <percentage>%

Creates kern tables which bring together the initial and final
glyphs to within `units` units of one another or within the specified
percentage of the width of the final glyph.

You will want to read https://simoncozens.github.io/nastaliq-autokerning/
before trying to understand this code.

    AtHeight <units1>-<units2> <routine>

Chains to the given routine for all sequences of glyphs between
`units1` and `units2` high. The context of the chained routine will
be the final/isolate glyph of the previous sequence, a space (if there
is one), and the glyph sequence. For example, in the case of the
sequence `لو پجل`, the height will be computed as 400 units in the
case of Gulzar; if 400 is between `units1` and `units2`, then the
routine will be called with `VAOf1` as the start of the chained glyph
sequence.
"""
import logging
import sys
import warnings
from glob import glob
from itertools import product
import csv

import fontFeatures
import tqdm
from fez import FEZVerb
from fontTools.feaLib.variableScalar import VariableScalar
from glyphtools import bin_glyphs_by_metric
from kerndeterminer import KernDeterminer

# Parser options for this plugin.
# NOTE(review): presumably read by the FEZ plugin loader - confirm.
PARSEOPTS = dict(use_helpers=True)

# No grammar is shared between the verbs; each verb has its own below.
GRAMMAR = ""

# `NastaliqKerning <units> <percentage>%`: the distance may be either a
# variable scalar or a plain integer; the percentage is an integer
# followed by a literal "%".
NastaliqKerning_GRAMMAR = """
?start: action
action: (variable_scalar | integer_container) integer_container "%"
"""

# `AtHeight <units1>-<units2> <routine>`: two integers separated by a
# literal "-", followed by the name of the routine to chain to.
AtHeight_GRAMMAR = """
?start: action
action: integer_container "-" integer_container BARENAME
"""

# The verbs provided by this plugin; each has a class of the same name
# further down in this file.
VERBS = ["NastaliqKerning", "AtHeight"]

# Messages are emitted bare (no level/name prefix); only warnings and
# above are shown by default.
logging.basicConfig(format="%(message)s")
logger = logging.getLogger("NastaliqKerning")
logger.setLevel(logging.WARN)

# Number of different "rise" groups, used in clustering the glyphs
# to determine the sequence height. This is O(n^2) in number of
# generated layout rules, so increasing this to 4 will overflow
# the font builder! 3 seems accurate enough for practical purposes.
ACCURACY1 = 3

# Controls the number of kern tables to be generated by rounding
# the computed height to this number of units. i.e. at baseline,
# at height of 100 units, at height of 200 units, etc. - up to...
RISE_QUANTIZATION = 100
# the maximum height we care about. All higher sequences will have
# the same kerning applied to them.
MAXIMUM_RISE = 600

# Rounding the kern values allows them to be stored more efficiently
# in the OpenType binary.
KERN_QUANTIZATION = 10

MINIMUM_KERN = -20  # If it's not less than this, don't bother

# Only consider sequences of this length - for longer sequences,
# we start counting height from the *medial* instead of the final.
MAXIMUM_WORD_LENGTH = 5


def taper_schedule(height):
    """Return the taper factor used for cross-space kerning at ``height``.

    When there is a space between the two letters being kerned, we look
    purely at the horizontal ink-to-ink distance between the glyphs. At
    the baseline the ink-to-ink distance is kept equal to the target
    distance, but as the left glyph's height increases a gap opens up
    underneath it, so the kern is tapered based on that height.

    Heights below 200 are not tapered (factor 1.0); the factor then drops
    by 0.1 for every further 100 units, bottoming out at 0.6 for heights
    of 500 and above.
    """
    # (exclusive upper bound of the height band, taper factor)
    bands = (
        (200, 1.0),
        (300, 0.9),
        (400, 0.8),
        (500, 0.7),
    )
    for upper_bound, factor in bands:
        if height < upper_bound:
            return factor
    return 0.6


def quantize(number, degree):
    """Round ``number`` to a multiple of ``degree``.

    The number of whole steps is found with the built-in ``round``, so
    exact halves follow its round-half-to-even behaviour.
    """
    whole_steps = round(number / degree)
    return degree * whole_steps


def zero_out_nonnegative_and_quantize(varkern, degree):
    """Clean up a variable kern in place and return it.

    Every per-location value that is not below -10 is replaced by zero;
    the remaining (negative) values are rounded with ``quantize`` to a
    multiple of ``degree``. If, after that, no location has a value of
    ``MINIMUM_KERN`` or less, the kern is not worth keeping and its
    ``values`` are emptied altogether, so callers can simply test
    ``varkern.values`` for truthiness.
    """
    cleaned = {
        location: quantize(kern, degree) if kern < -10 else 0
        for location, kern in varkern.values.items()
    }
    worth_keeping = any(value <= MINIMUM_KERN for value in cleaned.values())
    varkern.values = cleaned if worth_keeping else {}
    return varkern


class NastaliqKerning(FEZVerb):
    """Kern a Nastaliq font.

    ``action`` walks over every sequence of glyph groups that can follow
    an initial glyph, computes the (quantized) height of that sequence
    and adds a chaining rule which dispatches to a kern table generated
    for that height. The kern tables themselves are built lazily, one
    per height, by ``generate_kern_table_for_rise``.
    """

    def action(self, args):
        """Main entry point.

        ``args[0]`` is the target distance (either a plain integer or a
        variable scalar) and ``args[1]`` is the maximum tuck, given as a
        percentage. Returns the rules of the ``NastaliqKerning`` routine
        built here.

        While the rules are generated, every computed kern is also
        written to ``/tmp/debugkern.csv``. The file is closed again
        before this method returns.
        """
        # Read the parameters
        self.distance_at_closest = args[0].resolve_as_integer()
        self.variable = isinstance(self.distance_at_closest, VariableScalar)
        if not self.variable:
            # A plain number was given. Wrap it in a variable scalar with
            # a single value at the default master's location, so that
            # the rest of the code only has one type to deal with.
            new_distance = VariableScalar()
            default_location = tuple(
                self.parser.font.default_master.location.items()
            )
            new_distance.values[default_location] = self.distance_at_closest
            self.distance_at_closest = new_distance
        self.maxtuck = args[1].resolve_as_integer() / 100.0
        self.ink_to_ink_routines = {}

        # NOTE: raises IndexError if no file matches this pattern.
        self.kerner = KernDeterminer(glob("sources/build/*.glyphs")[0])

        # Read a few useful classes into Python variables.
        named_classes = self.parser.fontfeatures.namedClasses
        self.inits = named_classes["inits"]
        medis = named_classes["medis"]
        bariye = named_classes["bariye"]
        self.isols = [x for x in named_classes["isols"] if x not in bariye]
        finas = [x for x in named_classes["finas"] if x not in bariye]

        # Sorted, because the iteration order of a set of strings differs
        # from one run to the next, and this list determines the order of
        # the generated rules.
        self.isols_finas = sorted(set(self.isols + finas) | set(bariye))

        # These glyphs are special cased. We should probably read
        # `blockers` from a glyph class, really, instead of hard
        # coding it.
        blockers = ["AINf1", "JIMf1"]

        # Now we cluster the medials and finals based on their
        # rise.
        binned_medis = bin_glyphs_by_metric(
            self.parser.font, medis, "rise", bincount=ACCURACY1
        )
        binned_finas = bin_glyphs_by_metric(
            self.parser.font, finas, "rise", bincount=ACCURACY1
        )

        # This will hold kern tables for each rise value.
        self.kern_at_rise = {}

        # The main entry to our kerning routine. We ignore marks
        # and ligatures (spaces)
        routine = fontFeatures.Routine(name="NastaliqKerning")
        routine.flags = 0x04 | 0x08

        # All kern tables are generated while the rules below are built,
        # so the debug file only needs to stay open for this stretch.
        # newline="" is what the csv module asks for when writing.
        with open("/tmp/debugkern.csv", "w", newline="") as debug_file:
            self.debug_csv = csv.writer(debug_file)
            try:
                self._add_sequence_rules(
                    routine, binned_medis, binned_finas, blockers
                )

                # Finally, kern isolates against each other.
                routine.rules.append(
                    fontFeatures.Chaining(
                        [self.isols_finas],
                        lookups=[[self.generate_kern_table_for_rise(0)]],
                        postcontext=[self.isols],
                    )
                )
            finally:
                # Never leave a writer around that points at a closed file.
                self.debug_csv = None
        return routine.rules

    def _add_sequence_rules(self, routine, binned_medis, binned_finas, blockers):
        """Add to ``routine`` one chaining rule per sequence of glyph groups.

        ``binned_medis`` and ``binned_finas`` are the medial and final
        glyphs clustered by rise; ``blockers`` are glyph names which get
        a rule of their own instead of staying in their group.
        """

        def add_rule(target, postcontext, lookups):
            # The initial glyph always comes first in the lookahead. The
            # concatenation builds a fresh list, so `postcontext` can be
            # modified afterwards without affecting the rule.
            routine.rules.append(
                fontFeatures.Chaining(
                    target,
                    postcontext=[self.inits] + postcontext,
                    lookups=lookups,
                )
            )

        # We will build our word sequences, from longest to shortest.
        # `i` will count medial and final glyphs, not including the
        # initial glyph, which is why we go down to zero.
        for i in range(MAXIMUM_WORD_LENGTH, -1, -1):
            postcontext_options = [binned_finas] + [binned_medis] * i
            warnings.warn("Length " + str(i))

            # This iterator returns all sequences of glyph groups.
            # For example, when `i` is 2 it will return
            #    binned_finas[0] binned_medis[0] binned_medis[0]
            #    binned_finas[0] binned_medis[0] binned_medis[1]
            #    binned_finas[0] binned_medis[0] binned_medis[2]
            #    binned_finas[0] binned_medis[1] binned_medis[0]
            #    binned_finas[0] binned_medis[1] binned_medis[1]
            #    ...
            #    binned_finas[2] binned_medis[2] binned_medis[2]
            for postcontext_plus_rise in product(*postcontext_options):
                # Each group is a two-element tuple: the glyphs in
                # the group and the median rise for each group. By
                # summing the second element of each group, we get
                # the height of this sequence.
                word_tail_rise = quantize(
                    sum(x[1] for x in postcontext_plus_rise), RISE_QUANTIZATION
                )
                if word_tail_rise < 0:
                    continue

                # And by reading the first element, we get the glyphs
                # involved. The groups were generated final-first, so
                # reverse them into reading order.
                postcontext = [x[0] for x in reversed(postcontext_plus_rise)]
                if word_tail_rise >= MAXIMUM_RISE:
                    word_tail_rise = MAXIMUM_RISE
                    if i == MAXIMUM_WORD_LENGTH:
                        # Drop the fina, so that we match all sequence
                        # starting with these glyphs.
                        postcontext.pop()

                # The right hand side of our glyph pair
                target = [self.isols_finas]
                lookups = [[self.generate_kern_table_for_rise(word_tail_rise)]]

                # Are there any blocking final glyphs in this sequence?
                do_blockers = any(
                    blocker in postcontext[-1] for blocker in blockers
                )
                if do_blockers:
                    # If so, remove them from the group and handle them later;
                    # by unconditionally separating them from the group, we
                    # keep the groups constant across the whole lookup, which
                    # allows them to be represented as a format 2 lookup
                    # which is very efficient. (Sorted to keep the group's
                    # order the same from one run to the next.)
                    postcontext[-1] = sorted(set(postcontext[-1]) - set(blockers))

                # Call the appropriate kern table for this sequence
                add_rule(target, postcontext, lookups)

                # We now deal with blocking glyphs. If the sequence length
                # is 1 (init + final), skip it; otherwise, add it.
                if len(postcontext) > 1 and do_blockers:
                    postcontext[-1] = blockers
                    add_rule(target, postcontext, lookups)

                if word_tail_rise >= 400 and i > 4:
                    # HACK
                    # This has to be done separately to make the classes work
                    postcontext[-1] = ["BARI_YEf1"]
                    add_rule(target, postcontext, lookups)

    def _write_debug_row(self, *fields):
        """Write one row to the debug CSV, if it is currently open."""
        writer = getattr(self, "debug_csv", None)
        if writer is not None:
            writer.writerow(fields)

    def master_for_location(self, location):
        """Return the font master sitting at ``location``.

        ``location`` is a tuple of ``(axis, value)`` items, as used for
        the keys of a variable scalar's ``values``. Raises ``ValueError``
        if no master has that location.
        """
        masters = self.parser.font.masters
        for master in masters:
            if tuple(master.location.items()) == location:
                return master
        raise ValueError(
            f"Could not find master for location {location};"
            f" master locations were: {[m.location for m in masters]}"
        )

    def determine_kern_cached(self, glyph1, glyph2, height):
        """Return the kern between two glyphs at ``height`` as a variable scalar.

        One value is determined per location of the target distance.
        Despite the name, nothing is cached in this method itself; the
        callers in this class hand the result to
        ``zero_out_nonnegative_and_quantize``, which modifies it in place.
        """
        maxtuck = self.maxtuck or 0.4
        # Determines the kern; the heavy lifting is done in
        # kerndeterminer, we just orchestrate the handling of variable
        # font masters
        variable_kern = VariableScalar()
        variable_kern.axes = self.parser.font.axes
        for location, targetdistance in self.distance_at_closest.values.items():
            master = self.master_for_location(location)
            variable_kern.values[location] = self.kerner.determine_kern(
                glyph1,
                glyph2,
                master.name.get_default(),
                targetdistance,
                height,
                maxtuck,
            )
        return variable_kern

    def ink_to_ink_at(self, rise):
        """Return the cross-space ("ink to ink") kern routine for ``rise``.

        The routine is built on first use and remembered in
        ``self.ink_to_ink_routines``. For every pair of glyphs, the kern
        at each master is the tapered width of ``space.urdu`` minus the
        (non-negative) sidebearings facing the space; only the locations
        of the target distance are used here, not its values.
        """
        if rise in self.ink_to_ink_routines:
            return self.ink_to_ink_routines[rise]

        # Taper distance based on rise to make it visually equal!
        taper = taper_schedule(rise)

        ink_to_ink = fontFeatures.Routine(f"ink_to_ink_{rise}", flags=0x8 | 0x4)
        font = self.parser.font

        # The master and the tapered space width only depend on the
        # location, so work them out once instead of once per glyph pair.
        per_master = []
        for location in self.distance_at_closest.values:
            master = self.master_for_location(location)
            space_width = master.get_glyph_layer("space.urdu").width
            per_master.append((location, master, space_width * taper))

        lefts = self.inits + self.isols
        for right in self.isols_finas:
            for left in lefts:
                kern = VariableScalar()
                kern.axes = font.axes
                for location, master, tapered_space in per_master:
                    # Negative sidebearings are treated as zero.
                    right_of_left = max(master.get_glyph_layer(left).rsb, 0)
                    left_of_right = max(master.get_glyph_layer(right).lsb, 0)
                    kern.values[location] = int(
                        tapered_space - (right_of_left + left_of_right)
                    )
                kern = zero_out_nonnegative_and_quantize(kern, KERN_QUANTIZATION)
                self._write_debug_row("ink_to_ink", rise, left, right, str(kern))
                if kern.values:
                    if not self.variable:
                        kern = kern.default

                    ink_to_ink.rules.append(
                        fontFeatures.Positioning(
                            [[right], [left]],
                            [
                                fontFeatures.ValueRecord(),
                                fontFeatures.ValueRecord(xAdvance=kern),
                            ],
                        )
                    )
        self.ink_to_ink_routines[rise] = self.parser.fontfeatures.referenceRoutine(
            ink_to_ink
        )
        return self.ink_to_ink_routines[rise]

    def generate_kern_table_for_rise(self, rise):
        """Return the dispatch routine for sequences of height ``rise``.

        ``rise`` is rounded to a multiple of ``RISE_QUANTIZATION``; the
        routine for each rounded height is built once and then served
        from ``self.kern_at_rise``. The returned routine dispatches to
        the ink-to-ink routine when there is a ``space.urdu`` between the
        two glyphs, and to the pair kerning table otherwise.
        """
        # Quantize before consulting the cache: entries are stored under
        # the quantized height, so that is what we have to look up.
        rise = quantize(rise, RISE_QUANTIZATION)
        if rise in self.kern_at_rise:
            return self.kern_at_rise[rise]
        kerntable = {}

        print(f"Generating table for rise {rise}", file=sys.stderr)
        # At the baseline, the left glyph of the sequence is all the
        # isolates and initials; but if there is a rise, we must
        # have seen a medial/final before it so we ignore the isolates.
        if rise > 0:
            ends = self.inits
        else:
            ends = self.inits + self.isols

        # So this is easy; we just go through every combination and
        # determine the kern.
        with tqdm.tqdm(total=len(ends) * len(self.isols_finas), miniters=30) as pbar:
            for end_of_previous_word in self.isols_finas:
                kerntable[end_of_previous_word] = {}
                # initial of "long" sequence, i.e. left glyph
                for initial in sorted(ends):
                    logger.info("Left glyph: %s", initial)
                    logger.info("Right glyph: %s", end_of_previous_word)
                    kern = self.determine_kern_cached(
                        initial, end_of_previous_word, height=rise
                    )
                    logger.info(
                        "%s - %s @ %i : %s", initial, end_of_previous_word, rise, kern
                    )
                    # Only record a kern if we are actually bringing two glyphs closer.
                    kern = zero_out_nonnegative_and_quantize(kern, KERN_QUANTIZATION)
                    self._write_debug_row(
                        "kern", rise, initial, end_of_previous_word, str(kern)
                    )
                    if kern.values:
                        if not self.variable:
                            kern = kern.default
                        kerntable[end_of_previous_word][initial] = kern
                    pbar.update(1)

        # Once we've done so, we stick it in a pair positioning routine.
        kernroutine = fontFeatures.Routine(
            rules=[],
            name=f"kern_at_{rise}",
        )
        kernroutine.flags = 0x08 | 0x04
        abovemarks = self.parser.fontfeatures.namedClasses["all_above_marks"]
        kernroutine.markFilteringSet = abovemarks

        for left, kerns in kerntable.items():
            for right, value in kerns.items():
                kernroutine.rules.append(
                    fontFeatures.Positioning(
                        [[left], [right]],
                        [
                            fontFeatures.ValueRecord(),
                            fontFeatures.ValueRecord(xAdvance=value),
                        ],
                    )
                )
        kernroutine = self.parser.fontfeatures.referenceRoutine(kernroutine)

        # This kern routine is going to dispatch differently depending on
        # a) height and b) whether or not there is a space.
        dispatch = fontFeatures.Routine(name=f"dispatch_{rise}", flags=0x8)
        ink_to_ink_routine = self.ink_to_ink_at(rise)
        if ink_to_ink_routine.routine.rules:
            dispatch.rules.append(
                fontFeatures.Chaining(
                    [self.isols_finas, ["space.urdu"], ends],
                    lookups=[[ink_to_ink_routine], [], []],
                )
            )
        if kernroutine.routine.rules:
            dispatch.rules.append(
                fontFeatures.Chaining(
                    # One lookup slot per input position.
                    [self.isols_finas, ends],
                    lookups=[[kernroutine], []],
                )
            )
        self.kern_at_rise[rise] = dispatch
        return dispatch


# This is just a generic version of the above.
class AtHeight(FEZVerb):
    """Chain to a named routine for glyph sequences within a height range."""

    def action(self, args):
        """Build the routine which chains to the requested routine.

        ``args`` holds the lower bound, the upper bound and the name of
        the routine to chain to. The height of every sequence of glyph
        groups is computed (rounded to ``RISE_QUANTIZATION``), and a
        chaining rule is added for each sequence whose height lies
        within the bounds, both ends included. Returns a list holding
        the single routine built here.
        """
        lower_arg, upper_arg, routine_name = args
        height_lower = lower_arg.resolve_as_integer()
        height_upper = upper_arg.resolve_as_integer()
        target_routine = self.parser.fontfeatures.routineNamed(routine_name)

        named_classes = self.parser.fontfeatures.namedClasses
        self.inits = named_classes["inits"]
        medis = named_classes["medis"]
        isols = named_classes["isols"]
        finas = named_classes["finas"]
        self.isols_finas = isols + finas

        # Cluster the medials and the finals by their rise.
        binned_medis = bin_glyphs_by_metric(
            self.parser.font, medis, "rise", bincount=ACCURACY1
        )
        binned_finas = bin_glyphs_by_metric(
            self.parser.font, finas, "rise", bincount=ACCURACY1
        )

        routine = fontFeatures.Routine(
            name=f"At_{height_lower}_{height_upper}_{target_routine.name}"
        )
        routine.flags = 0x04 | 0x08

        # Longest sequences first; the count covers the medials only,
        # every sequence additionally has one final.
        for medial_count in reversed(range(MAXIMUM_WORD_LENGTH + 1)):
            medial_slots = [binned_medis] * medial_count
            for groups in product(binned_finas, *medial_slots):
                # Each group is (glyphs, rise); the rises add up to the
                # height of the whole sequence.
                height = quantize(
                    sum(group[1] for group in groups), RISE_QUANTIZATION
                )
                if not (height_lower <= height <= height_upper):
                    continue

                # The groups come out final-first, so flip them into
                # reading order for the lookahead.
                glyph_sequence = [group[0] for group in reversed(groups)]
                target = [self.isols_finas, self.inits]
                # Only the first target position triggers the routine.
                lookups = [[target_routine]] + [None] * (len(target) - 1)
                rule = fontFeatures.Chaining(
                    target,
                    postcontext=glyph_sequence,
                    lookups=lookups,
                )
                routine.rules.append(rule)
        return [routine]