Unicode normalization
Jump to navigation
Jump to search
First posted on TEI-L, the code below is a Schematron rule (with a companion XSL function) that locates and identifies all non-normalized Unicode characters and offers a quick fix to normalize them.
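The code must be part of a valid Schematron file that uses an XSLT 2.0 query binding (queryBinding="xslt2"), because it relies on XPath 2.0 functions and an embedded xsl:function. The prefix sqf must be bound to the namespace http://www.schematron-quickfix.com/validator/process. The prefix func can be bound to any namespace. The prefix xs, used in the function's type declarations, must be bound to http://www.w3.org/2001/XMLSchema unless it is already declared on the function itself.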
<rule context="text()">
<!-- Purpose: flag every text node that is not in Unicode Normalization
     Form C (NFC), list the offending characters with their code points
     and positions, and offer a quick fix that rewrites the text as NFC. -->
<!-- One item per character of the text node as it stands. Splitting via
     code points (rather than inserting spaces and tokenizing) keeps spaces
     and line breaks as ordinary single-character items, so the positions
     reported below are true 1-based character offsets in the text node. -->
<let name="this-raw-char-seq" value="for $c in string-to-codepoints(.) return codepoints-to-string($c)"/>
<!-- One item per character of the NFC-normalized form of the same text. -->
<let name="this-nfc-char-seq" value="for $c in string-to-codepoints(normalize-unicode(.)) return codepoints-to-string($c)"/>
<!-- Distinct raw characters that do not occur anywhere in the NFC form. -->
<let name="this-non-nfc-seq" value="distinct-values($this-raw-char-seq[not(.=$this-nfc-char-seq)])"/>
<!-- The message lists, for each offending character: the character itself,
     its code point in hexadecimal (via func:dec-to-hex), and every position
     at which it occurs in the raw sequence. -->
<assert test=". = normalize-unicode(.)" sqf:fix="normalize-unicode">All text needs to be
normalized (NFC). Errors: <value-of
select="for $i in $this-non-nfc-seq return concat($i,' (U+',
func:dec-to-hex(string-to-codepoints($i)),') at ',
string-join(for $j in index-of($this-raw-char-seq,$i) return string($j),' ')),' '"
/></assert>
<!-- Quick fix referenced by the assert above through its id. -->
<!-- NOTE(review): '.' in the regex does not match line breaks by default,
     so a multi-line text node is presumably handled one line at a time;
     confirm the behaviour with the SQF processor in use. -->
<sqf:fix id="normalize-unicode">
<sqf:description>
<sqf:title>Convert to normalized (NFC) Unicode</sqf:title>
</sqf:description>
<sqf:stringReplace match="." regex=".+"><value-of select="normalize-unicode(.)"
/></sqf:stringReplace>
</sqf:fix>
</rule>
<!-- func:dec-to-hex: converts a non-negative integer to its upper-case
     hexadecimal representation without leading zeros (0 yields '0').
     Parameter in: the integer to convert (the rule passes a code point).
     Returns: the hexadecimal digits as xs:string.
     The value is split recursively: all digits above the lowest one come
     from the recursive call on ($in idiv 16), the lowest digit is looked up
     in the digit string. The recursion must also be taken for exactly 16
     (hence 'ge', not 'gt'), otherwise 16 would be rendered as '0'. -->
<xsl:function name="func:dec-to-hex" as="xs:string"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema">
<xsl:param name="in" as="xs:integer"/>
<xsl:sequence
select="if ($in eq 0)
then '0'
else
concat(if ($in ge 16)
then func:dec-to-hex($in idiv 16)
else '',
substring('0123456789ABCDEF',
($in mod 16) + 1, 1))"
/>
</xsl:function>