Unshy.xslt
Jump to navigation
Jump to search
A stylesheet that reads in TEI (or other XML) files and removes line-end soft hyphens. See the header comment for details.
<?xml version="1.0" encoding="UTF-8"?>
<!-- unshy.xslt -->
<!-- Reads in an XML file, writes out the same file with end-of-line soft hyphens -->
<!-- removed. Any whitespace between the soft hyphen and the next text node is -->
<!-- also removed. The markup between the soft hyphen and the next text node is -->
<!-- inserted immediately before the first whitespace character of the next text node -->
<!-- (or after it, if it has no internal whitespace). -->
<!-- Written 2008-07-24/26 by Syd Bauman -->
<!-- Updated 2008-08-07/08 by Syd Bauman: -->
<!-- bug fixes: -->
<!-- * incorrect conditional on last non-whitespace error message test -->
<!-- * skip all-whitespace nodes -->
<!-- * trim off leading whitespace, rather than completely normalize, so -->
<!-- that we don't nuke trailing space (sometimes it's important) -->
<!-- Copyleft 2008 Syd Bauman and the Brown University Women Writers Project -->
<!-- -->
<!-- Known issues (some would consider them bugs, others would say features): -->
<!-- * If there is more than one shy in a given text node, we end up -->
<!-- doing the wrong thing: deleting all the text after the *first* -->
<!-- shy, not the last. However, in this case we issue a warning -->
<!-- message, and since this case is never supposed to happen at the -->
<!-- WWP, we'd prefer the warning to better behavior. -->
<!-- * If the shy is not the last non-whitespace character, we end up -->
<!-- doing the wrong thing: deleting all the text after the shy, rather -->
<!-- than just any following whitespace. But again, this situation is -->
<!-- never supposed to happen at the WWP, so we'd prefer the warning to -->
<!-- doing the right thing. -->
<!-- * If the shy is the last text character in the document, it is silently -->
<!-- removed. It should probably be removed, but IMHO a warning would be a -->
<!-- good idea. -->
<!-- In truth, the right thing to do might be to validate that shys are placed -->
<!-- appropriately with some other software (like a Schematron schema), and then -->
<!-- in this stylesheet test for shy only at end-of-line (except for whitespace). -->
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" >
<!-- Serialization settings: write the result as XML, encoded in UTF-8, -->
<!-- and keep the XML declaration at the top of the output. -->
<xsl:output method="xml" encoding="UTF-8" omit-xml-declaration="no"/>
<!-- Entry point. From the document node, hand every top-level child to the -->
<!-- templates below: the root element (probably /TEI or /tei:TEI) plus any -->
<!-- comments or processing instructions that sit outside it. -->
<xsl:template match="/">
  <xsl:apply-templates select="node()"/>
</xsl:template>
<!-- Default rule: any node (element, attribute, text, PI, or comment) that -->
<!-- no more specific template below claims is copied through as-is. Apart -->
<!-- from those templates, this stylesheet is therefore the identity transform. -->
<xsl:template match="@*|node()">
  <xsl:copy>
    <!-- attributes first, then children: the same order as document order -->
    <xsl:apply-templates select="@*"/>
    <xsl:apply-templates select="node()"/>
  </xsl:copy>
</xsl:template>
<!-- Subroutine that strips leading space off of a string, but, -->
<!-- unlike normalize-space(), leaves trailing and internal -->
<!-- space untouched. "Space" here means the characters of the 'S' -->
<!-- production of the XML specification: space, tab, LF and CR. -->
<!-- Input: parameter "str", a string -->
<!-- Returns: value of "str" w/o leading space (the empty string if "str" -->
<!--          is empty or consists only of whitespace) -->
<!-- Calls: nothing (no recursion, so the length of the leading run of -->
<!--        whitespace cannot exhaust the processor's call depth) -->
<!-- Note: whitespace is never written literally inside an attribute here. -->
<!-- The XML parser turns a literal tab, LF or CR inside an attribute value -->
<!-- into a plain space, so comparing against such literals only ever -->
<!-- detects U+0020. normalize-space() knows all four characters natively. -->
<xsl:template name="strip-leading-space">
  <!-- first and only parameter is a string -->
  <xsl:param name="str"/>
  <!-- The first non-whitespace character of $str: normalize-space() drops -->
  <!-- all leading whitespace, so its first character is what we want. -->
  <!-- Empty when $str holds no non-whitespace character at all. -->
  <xsl:variable name="firstInk" select="substring( normalize-space( $str ), 1, 1 )"/>
  <xsl:if test="$firstInk != ''">
    <!-- Everything in front of the first occurrence of $firstInk is, by -->
    <!-- construction, whitespace; return $firstInk plus all that follows it. -->
    <xsl:value-of select="concat( $firstInk, substring-after( $str, $firstInk ) )"/>
  </xsl:if>
  <!-- otherwise: nothing but whitespace (or nothing at all), return '' -->
</xsl:template>
<!-- Match (or handle when called explicitly by name) any text node that -->
<!-- contains a soft hyphen. The soft hyphen (U+00AD) is always written as a -->
<!-- character reference in this template because the literal is invisible. -->
<!-- Input: optional parameter "strip1st"; when 'true' the first word of the -->
<!--        text node is skipped. Default is 'false'. -->
<!-- Output: the (space-normalized) text before the soft hyphen, immediately -->
<!--         followed by the first word of the next non-blank sibling text -->
<!--         node, followed by a newline. -->
<!-- Warns via xsl:message if the text holds more than one soft hyphen, or -->
<!-- if anything follows the soft hyphen; see the known issues in the -->
<!-- header comment of this file. -->
<xsl:template match="text()[contains(.,'&#xAD;')]" name="hasShy">
  <!-- Should we be stripping off the first word of this text node? -->
  <xsl:param name="strip1st">false</xsl:param> <!-- not unless asked -->
  <!-- set the text to be processed (putting it in variable $me) -->
  <xsl:variable name="me">
    <xsl:choose>
      <!-- if we've been asked to strip off the first word, do so -->
      <xsl:when test="$strip1st = 'true'">
        <!-- Split the normalized text (where every run of whitespace has -->
        <!-- become one blank) at its first blank. substring-after() yields -->
        <!-- the empty string when there is no blank, i.e. when there is -->
        <!-- only one word, so no separate test is needed. -->
        <xsl:value-of select="substring-after( normalize-space(.), ' ' )"/>
      </xsl:when>
      <!-- else just normalize space, just so things are handled evenly -->
      <xsl:otherwise>
        <xsl:value-of select="normalize-space(.)"/>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:variable>
  <!-- Get the string content of the next sibling text node (other than text -->
  <!-- nodes that have nothing but whitespace). normalize-space() already -->
  <!-- removes its leading whitespace, if any. (Put it in variable $nextText.) -->
  <xsl:variable name="nextText"
                select="normalize-space( following-sibling::text()[not( normalize-space(.) = '' )][1] )"/>
  <!-- some rudimentary error-checking -->
  <xsl:choose>
    <xsl:when test="contains( substring-after( $me, '&#xAD;' ), '&#xAD;' )">
      <xsl:message>Oh dear. More than 1 &amp;shy; in this text sequence, I'm not going to get this one right.</xsl:message>
    </xsl:when>
    <xsl:when test="string-length( substring-after( $me, '&#xAD;' ) ) &gt; 0">
      <xsl:message>Oh dear. This &amp;shy; is not the last (non-whitespace) character of the text node. I'm probably going to mess this up.</xsl:message>
    </xsl:when>
  </xsl:choose>
  <!-- output myself up to, but not including, shy -->
  <xsl:value-of select="substring-before( $me, '&#xAD;' )"/>
  <!-- First token of next text node: appending a blank guarantees there is -->
  <!-- one to split at, so a single-word $nextText is taken whole. -->
  <xsl:value-of select="substring-before( concat( $nextText, ' ' ), ' ' )"/>
  <!-- a newline to separate this word from the rest -->
  <xsl:text>&#xA;</xsl:text>
</xsl:template>
<!-- Now match text nodes for which the preceding (non-blank) sibling text -->
<!-- node had a shy. This template is given a higher priority than 'hasShy', -->
<!-- so that when we hit a text node that meets both criteria (has a shy, -->
<!-- previous text node had a shy), we come here first. -->
<!-- The soft hyphen (U+00AD) and the whitespace characters are written as -->
<!-- character references: the soft hyphen because it is invisible, and -->
<!-- tab/LF/CR because the XML parser turns their literal forms into plain -->
<!-- spaces inside an attribute value. -->
<xsl:template match="text()[preceding-sibling::text()[not(normalize-space(.)='')][1][contains(.,'&#xAD;')]]" priority="1">
  <!-- preceding text node had a shy, so it was caught by the hasShy template, -->
  <!-- which grabbed and printed out the first token of this text node. So our -->
  <!-- goal is to print out the rest. But if this node also has a shy, we have -->
  <!-- to process it accordingly, too. -->
  <xsl:choose>
    <xsl:when test="contains(.,'&#xAD;')">
      <!-- let hasShy do the work, telling it to skip our first token -->
      <xsl:call-template name="hasShy">
        <xsl:with-param name="strip1st">true</xsl:with-param>
      </xsl:call-template>
    </xsl:when>
    <xsl:otherwise>
      <!-- create a copy of myself (remember, I'm a text node) without leading -->
      <!-- whitespace, and with tab, LF and CR each converted to a blank, so -->
      <!-- that the first token ends at a blank no matter which whitespace -->
      <!-- character followed it in the input -->
      <xsl:variable name="sans-leading">
        <xsl:call-template name="strip-leading-space">
          <xsl:with-param name="str"
                          select="translate( ., '&#x9;&#xA;&#xD;', '&#x20;&#x20;&#x20;' )"/>
        </xsl:call-template>
      </xsl:variable>
      <!-- return everything after the first whitespace character -->
      <!-- (now that we've removed leading spaces, that means everything -->
      <!-- except the first token; nothing at all if there is only one token) -->
      <xsl:value-of select="substring-after( $sans-leading, ' ' )"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
</xsl:stylesheet>
I make no claim that this is even a good, let alone the best, way to do this. It did work on my test files, though.