Difference between revisions of "Unicode normalization"
Jump to navigation
Jump to search
(Created page with "There was [https://listserv.brown.edu/archives/cgi-bin/wa?A2=TEI-L;29cb6c10.1505 a post on TEI-L] that included a "Schematron rule (with companion xsl function) to locate and ide...") |
(Added Schematron, function code) |
||
Line 1: | Line 1: | ||
− | + | First [https://listserv.brown.edu/archives/cgi-bin/wa?A2=TEI-L;29cb6c10.1505 posted on TEI-L], the code below is a Schematron rule (with a companion XSLT function) that locates and identifies all non-normalized Unicode characters and offers a quick fix to normalize them. | |
+ | |||
+ | Code must be part of a valid Schematron file. The prefix sqf must be bound to the namespace http://www.schematron-quickfix.com/validator/process. The prefix func can be bound to any namespace. | ||
+ | |||
+ | <code> | ||
+ | <rule context="text()"> | ||
+ | <let name="this-raw-char-seq" value="tokenize(replace(.,'(.)','$1 '),' ')"/> | ||
+ | <let name="this-nfc-char-seq" value="tokenize(replace(normalize-unicode(.),'(.)','$1 '),' ')"/> | ||
+ | <let name="this-non-nfc-seq" value="distinct-values($this-raw-char-seq[not(.=$this-nfc-char-seq)])"/> | ||
+ | <assert test=". = normalize-unicode(.)" sqf:fix="normalize-unicode">All text needs to be | ||
+ | normalized (NFC). Errors: <value-of | ||
+ | select="for $i in $this-non-nfc-seq return concat($i,' (U+', | ||
+ | func:dec-to-hex(string-to-codepoints($i)),') at ', | ||
+ | string-join(for $j in index-of($this-raw-char-seq,$i) return string($j),' ')),' '" | ||
+ | /></assert> | ||
+ | <sqf:fix id="normalize-unicode"> | ||
+ | <sqf:description> | ||
+ | <sqf:title>Convert to normalized (NFC) Unicode</sqf:title> | ||
+ | </sqf:description> | ||
+ | <sqf:stringReplace match="." regex=".+"><value-of select="normalize-unicode(.)" | ||
+ | /></sqf:stringReplace> | ||
+ | </sqf:fix> | ||
+ | </rule> | ||
+ | <xsl:function name="func:dec-to-hex" as="xs:string" | ||
+ | xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> | ||
+ | <!-- Input: Integer. Output: Hexadecimal equivalent string. --> | ||
+ | <xsl:param name="in" as="xs:integer"/> | ||
+ | <xsl:sequence | ||
+ | select="if ($in eq 0) | ||
+ | then '0' | ||
+ | else | ||
+ | concat(if ($in ge 16) | ||
+ | then func:dec-to-hex($in idiv 16) | ||
+ | else '', | ||
+ | substring('0123456789ABCDEF', | ||
+ | ($in mod 16) + 1, 1))" | ||
+ | /> | ||
+ | </xsl:function> | ||
+ | </code> | ||
[[Category:Schematron]] | [[Category:Schematron]] |
Latest revision as of 16:55, 14 May 2015
First posted on TEI-L, the code below is a Schematron rule (with a companion XSLT function) that locates and identifies all non-normalized Unicode characters and offers a quick fix to normalize them.
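The code must be part of a valid Schematron file that uses the XSLT 2.0 query binding (queryBinding="xslt2"), because it relies on XPath 2.0 functions such as normalize-unicode(). The prefix sqf must be bound to the namespace http://www.schematron-quickfix.com/validator/process, and the prefix xs (used in the function's type declarations) must be bound to http://www.w3.org/2001/XMLSchema. The prefix func can be bound to any namespace.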
<rule context="text()">
  <!-- Reports every text node that differs from its NFC-normalized form,
       lists each offending character with its code point (via
       func:dec-to-hex) and its character position(s) in the node, and
       offers a QuickFix that replaces the text with its NFC form.
       Requires XPath 2.0 (XSLT 2.0 query binding). -->

  <!-- One single-character string per code point of the raw text.
       Splitting by code point (rather than inserting a space after each
       character and tokenizing on spaces) keeps spaces and line breaks as
       ordinary characters, so the positions reported below are true
       character offsets and every item is exactly one character. -->
  <let name="this-raw-char-seq"
       value="for $cp in string-to-codepoints(.) return codepoints-to-string($cp)"/>
  <!-- The same split applied to the NFC-normalized text. -->
  <let name="this-nfc-char-seq"
       value="for $cp in string-to-codepoints(normalize-unicode(.)) return codepoints-to-string($cp)"/>
  <!-- Distinct raw characters that never occur in the normalized text. -->
  <let name="this-non-nfc-seq"
       value="distinct-values($this-raw-char-seq[not(. = $this-nfc-char-seq)])"/>

  <assert test=". = normalize-unicode(.)" sqf:fix="normalize-unicode">All text needs to be
normalized (NFC). Errors: <value-of
      select="for $i in $this-non-nfc-seq return concat($i,' (U+',
              func:dec-to-hex(string-to-codepoints($i)),') at ',
              string-join(for $j in index-of($this-raw-char-seq,$i) return string($j),' ')),' '"
    /></assert>

  <sqf:fix id="normalize-unicode">
    <sqf:description>
      <sqf:title>Convert to normalized (NFC) Unicode</sqf:title>
    </sqf:description>
    <!-- The regex .+ selects each run of characters up to a line break;
         each matched run is replaced by its NFC-normalized form. -->
    <sqf:stringReplace match="." regex=".+"><value-of select="normalize-unicode(.)"
      /></sqf:stringReplace>
  </sqf:fix>
</rule>
<xsl:function name="func:dec-to-hex" as="xs:string"
              xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <!-- Input: non-negative integer. Output: its hexadecimal representation
       as a string of uppercase digits, without padding or prefix
       (0 gives '0', 233 gives 'E9'). -->
  <xsl:param name="in" as="xs:integer"/>
  <!-- Recursion: emit the hex digits of ($in idiv 16) first, then the digit
       for ($in mod 16). The guard must be "ge 16": with "gt 16" the value 16
       itself skipped the recursion and came out as '0' instead of '10'.
       When $in is below 16 there are no higher digits, so the prefix is the
       empty string. -->
  <xsl:sequence
    select="if ($in eq 0)
            then '0'
            else
              concat(if ($in ge 16)
                     then func:dec-to-hex($in idiv 16)
                     else '',
                     substring('0123456789ABCDEF',
                               ($in mod 16) + 1, 1))"
  />
</xsl:function>