Difference between revisions of "Unshy.xslt"
Jump to navigation
Jump to search
(bug fixes) |
|||
| (2 intermediate revisions by the same user not shown) | |||
| Line 10: | Line 10: | ||
<!-- (or after it, if it has no internal whitespace). --> | <!-- (or after it, if it has no internal whitespace). --> | ||
<!-- Written 2008-07-24/26 by Syd Bauman --> | <!-- Written 2008-07-24/26 by Syd Bauman --> | ||
| + | <!-- Updated 2008-08-07/08 by Syd Bauman: --> | ||
| + | <!-- bug fixes: --> | ||
| + | <!-- * incorrect conditional on last non-whitespace error message test --> | ||
| + | <!-- * skip all-whitespace nodes --> | ||
| + | <!-- * trim off leading whitespace, rather than completely normalize, so --> | ||
| + | <!-- that we don't nuke trailing space (sometimes it's important) --> | ||
<!-- Copyleft 2008 Syd Bauman and the Brown University Women Writers Project --> | <!-- Copyleft 2008 Syd Bauman and the Brown University Women Writers Project --> | ||
<!-- --> | <!-- --> | ||
| − | <!-- Known issues (some would consider them bugs, | + | <!-- Known issues (some would consider them bugs, others would say features): --> |
| − | <!-- * If there | + | <!-- * If there is more than one shy in a given text node, we end up --> |
<!-- doing the wrong thing: deleting all the text after the *first* --> | <!-- doing the wrong thing: deleting all the text after the *first* --> | ||
<!-- shy, not the last. However, in this case we issue a warning --> | <!-- shy, not the last. However, in this case we issue a warning --> | ||
| Line 26: | Line 32: | ||
<!-- removed. It should probably be removed, but IMHO a warning would be a --> | <!-- removed. It should probably be removed, but IMHO a warning would be a --> | ||
<!-- good idea. --> | <!-- good idea. --> | ||
| − | <!-- In truth, the right thing to do might be to validate that shys are | + | <!-- In truth, the right thing to do might be to validate that shys are plqced --> |
<!-- appropiately with some other software (like a Schematron schema), and then --> | <!-- appropiately with some other software (like a Schematron schema), and then --> | ||
<!-- in this stylesheet test for shy only at end-of-line (except for whitespace). --> | <!-- in this stylesheet test for shy only at end-of-line (except for whitespace). --> | ||
<xsl:stylesheet version="1.0" | <xsl:stylesheet version="1.0" | ||
| − | xmlns:xsl="http://www.w3.org/1999/XSL/Transform | + | xmlns:xsl="http://www.w3.org/1999/XSL/Transform" > |
| − | |||
<!-- housekeeping --> | <!-- housekeeping --> | ||
| Line 49: | Line 54: | ||
<xsl:apply-templates select="@*|node()"/> | <xsl:apply-templates select="@*|node()"/> | ||
</xsl:copy> | </xsl:copy> | ||
| + | </xsl:template> | ||
| + | |||
| + | <!-- Subroutine that strips leading space off of a string, but, --> | ||
| + | <!-- unlike normalize-space(), leaves trailing and internal --> | ||
| + | <!-- space untouched. --> | ||
| + | <!-- Input: parameter "str", a string --> | ||
| + | <!-- Returns: value of "str" w/o leading space --> | ||
| + | <!-- Calls: itself --> | ||
| + | <xsl:template name="strip-leading-space"> | ||
| + | <!-- first and only parameter is a string --> | ||
| + | <xsl:param name="str"/> | ||
| + | <!-- parse off first character to examine --> | ||
| + | <xsl:variable name="first" select="substring( $str, 1, 1 )"/> | ||
| + | <xsl:choose> | ||
| + | <!-- test 1st char to see if it is a whitespace char as defined by --> | ||
| + | <!-- the 'S' production of the XML specification. --> | ||
| + | <xsl:when test="$first='&#x20;' or $first='&#x09;' or $first='&#x0D;' or $first='&#x0A;'"> | ||
| + | <!-- yes: recursively call myself with the string parameter but with the --> | ||
| + | <!-- first character (which is whitespace) stripped off --> | ||
| + | <xsl:call-template name="strip-leading-space"> | ||
| + | <xsl:with-param name="str" select="substring( $str, 2 )"/> | ||
| + | </xsl:call-template> | ||
| + | </xsl:when> | ||
| + | <xsl:otherwise> | ||
| + | <!-- no: then there is no leading whitespace, this is the string we wish to return --> | ||
| + | <xsl:value-of select="$str"/> | ||
| + | </xsl:otherwise> | ||
| + | </xsl:choose> | ||
</xsl:template> | </xsl:template> | ||
<!-- Match (or handle when called explicitly) any text node that contains --> | <!-- Match (or handle when called explicitly) any text node that contains --> | ||
<!-- a soft hyphen. --> | <!-- a soft hyphen. --> | ||
| − | <xsl:template match="text()[contains(.,'&# | + | <xsl:template match="text()[contains(.,'&#xAD;')]" name="hasShy"> |
<!-- Should we be stripping off the first word of this text node? --> | <!-- Should we be stripping off the first word of this text node? --> | ||
<xsl:param name="strip1st">false</xsl:param> <!-- not unless asked --> | <xsl:param name="strip1st">false</xsl:param> <!-- not unless asked --> | ||
| Line 75: | Line 108: | ||
</xsl:choose> | </xsl:choose> | ||
</xsl:variable> | </xsl:variable> | ||
| − | <!-- | + | <!-- Get the string content of the next text node (other than text nodes that --> |
| + | <!-- have nothing but whitespace) w/o its leading whitespace, if any. --> | ||
| + | <!-- (Put it in variable $nextText.) --> | ||
<xsl:variable name="nextText"> | <xsl:variable name="nextText"> | ||
| − | <xsl: | + | <xsl:call-template name="strip-leading-space"> |
| + | <xsl:with-param name="str" select="normalize-space(following-sibling::text()[not(normalize-space(.)='')][1])"/> | ||
| + | </xsl:call-template> | ||
</xsl:variable> | </xsl:variable> | ||
<!-- some rudimentary error-checking --> | <!-- some rudimentary error-checking --> | ||
<xsl:choose> | <xsl:choose> | ||
| − | <xsl:when test="contains(substring-after($me,'­'),'­')"> | + | <xsl:when test="contains(substring-after($me,'&#xAD;'),'&#xAD;')"> |
| − | <xsl:message>Oh dear. More than 1 &shy; in this text sequence, I'm not going to get this one right.</xsl:message> | + | <xsl:message>Oh dear. More than 1 &amp;shy; in this text sequence, I'm not going to get this one right.</xsl:message> |
</xsl:when> | </xsl:when> | ||
| − | <xsl:when test=" | + | <xsl:when test="string-length( substring-after( $me,'&#xAD;') ) > 0"> |
| − | <xsl:message>Oh dear. This &shy; is not the last (non-whitespace) character of the text node. I'm probably going to mess this up.</xsl:message> | + | <xsl:message>Oh dear. This &amp;shy; is not the last (non-whitespace) character of the text node. I'm probably going to mess this up.</xsl:message> |
</xsl:when> | </xsl:when> | ||
</xsl:choose> | </xsl:choose> | ||
<!-- output myself up to, but not including, shy --> | <!-- output myself up to, but not including, shy --> | ||
| − | <xsl:value-of select="substring-before($me,'­')"/> | + | <xsl:value-of select="substring-before($me,'&#xAD;')"/> |
<!-- first token of next text node --> | <!-- first token of next text node --> | ||
<xsl:choose> | <xsl:choose> | ||
| Line 103: | Line 140: | ||
</xsl:choose> | </xsl:choose> | ||
<!-- a newline to separate this word from the rest --> | <!-- a newline to separate this word from the rest --> | ||
| − | <xsl:text>
</xsl:text> | + | <xsl:text>&#x0A;</xsl:text> |
</xsl:template> | </xsl:template> | ||
| − | <!-- Now match text nodes for which the preceding text node had a shy --> | + | <!-- Now match text nodes for which the preceding text node had a shy. --> |
| − | <!-- | + | <!-- We want this template to be higher priority than 'hasShy', so that --> |
<!-- when we hit a line that meets both criteria (has a shy, previous --> | <!-- when we hit a line that meets both criteria (has a shy, previous --> | ||
<!-- text node had a shy), we come here first --> | <!-- text node had a shy), we come here first --> | ||
| − | <xsl:template match="text()[preceding-sibling::text()[1][contains(.,'­')]]" priority="1"> | + | <xsl:template match="text()[preceding-sibling::text()[not(normalize-space(.)='')][1][contains(.,'&#xAD;')]]" priority="1"> |
<!-- preceding text node ended in shy, so it was caught by the hasShy template, --> | <!-- preceding text node ended in shy, so it was caught by the hasShy template, --> | ||
<!-- which grabbed and printed out the first token of this text node. So our --> | <!-- which grabbed and printed out the first token of this text node. So our --> | ||
<!-- goal is to print out the rest. But if this node also ends in a shy, we have --> | <!-- goal is to print out the rest. But if this node also ends in a shy, we have --> | ||
| − | <!-- process it accordingly, too. --> | + | <!-- to process it accordingly, too. --> |
<xsl:choose> | <xsl:choose> | ||
| − | <xsl:when test="contains(.,'&# | + | <xsl:when test="contains(.,'&#xAD;')"> |
<xsl:call-template name="hasShy"> | <xsl:call-template name="hasShy"> | ||
<xsl:with-param name="strip1st">true</xsl:with-param> | <xsl:with-param name="strip1st">true</xsl:with-param> | ||
| Line 122: | Line 159: | ||
</xsl:when> | </xsl:when> | ||
<xsl:otherwise> | <xsl:otherwise> | ||
| − | <xsl:value-of select="substring-after( | + | <!-- create a copy of myself (remember, I'm a text node) without leading --> |
| + | <!-- whitespace, and with other whitespace characters converted to blank --> | ||
| + | <xsl:variable name="sans-leading"> | ||
| + | <xsl:call-template name="strip-leading-space"> | ||
| + | <xsl:with-param name="str"><xsl:value-of select="translate(.,'&#x09;&#x0D;&#x0A;',' ')"/></xsl:with-param> | ||
| + | </xsl:call-template> | ||
| + | </xsl:variable> | ||
| + | <!-- return everything after the first whitespace character --> | ||
| + | <!-- (now that we've removed leading spaces, that means everything --> | ||
| + | <!-- except the first token) --> | ||
| + | <xsl:value-of select="substring-after( $sans-leading,' ')"/> | ||
</xsl:otherwise> | </xsl:otherwise> | ||
</xsl:choose> | </xsl:choose> | ||
| Line 129: | Line 176: | ||
</xsl:stylesheet> | </xsl:stylesheet> | ||
</nowiki></pre> | </nowiki></pre> | ||
| + | |||
I make no claim that this is even a good, let alone the best, way to do this. It did work on my test files, though. | I make no claim that this is even a good, let alone the best, way to do this. It did work on my test files, though. | ||
| + | |||
| + | [[Category:XSLT]][[Category:XSLT:1.0]] | ||
Latest revision as of 16:34, 8 August 2008
A stylesheet to read in TEI (or other XML files) and remove line-end soft hyphens. See header comment for details.
<?xml version="1.0" encoding="UTF-8"?>
<!-- unshy.xslt -->
<!-- Reads in an XML file, writes out the same file with end-of-line soft hyphens -->
<!-- removed. Any whitespace between the soft hyphen and the next text node is -->
<!-- also removed. The markup between the soft hyphen and the next text node is -->
<!-- inserted immediately before the first whitespace character of the next text node -->
<!-- (or after it, if it has no internal whitespace). -->
<!-- Written 2008-07-24/26 by Syd Bauman -->
<!-- Updated 2008-08-07/08 by Syd Bauman: -->
<!-- bug fixes: -->
<!-- * incorrect conditional on last non-whitespace error message test -->
<!-- * skip all-whitespace nodes -->
<!-- * trim off leading whitespace, rather than completely normalize, so -->
<!-- that we don't nuke trailing space (sometimes it's important) -->
<!-- Copyleft 2008 Syd Bauman and the Brown University Women Writers Project -->
<!-- -->
<!-- Known issues (some would consider them bugs, others would say features): -->
<!-- * If there is more than one shy in a given text node, we end up -->
<!-- doing the wrong thing: deleting all the text after the *first* -->
<!-- shy, not the last. However, in this case we issue a warning -->
<!-- message, and since this case is never supposed to happen at the -->
<!-- WWP, we'd prefer the warning to better behavior. -->
<!-- * If the shy is not the last non-whitespace character, we end up -->
<!-- doing the wrong thing: deleting all the text after the shy, rather -->
<!-- than just any following whitespace. But again, this situation is -->
<!-- never supposed to happen at the WWP, so we'd prefer the warning to -->
<!-- doing the right thing. -->
<!-- * If the shy is the last text character in the document, it is silently -->
<!-- removed. It should probably be removed, but IMHO a warning would be a -->
<!-- good idea. -->
<!-- In truth, the right thing to do might be to validate that shys are placed -->
<!-- appropriately with some other software (like a Schematron schema), and then -->
<!-- in this stylesheet test for shy only at end-of-line (except for whitespace). -->
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" >
<!-- housekeeping -->
<xsl:output encoding="UTF-8" omit-xml-declaration="no" method="xml"/>
<!-- Entry point: the document node. Hand each of its children (the root -->
<!-- element, probably /TEI or /tei:TEI, plus any comments or PIs that sit -->
<!-- outside the root element) to whichever template matches it. -->
<xsl:template match="/">
  <xsl:apply-templates select="node()"/>
</xsl:template>
<!-- Identity rule: a node (element, attribute, text, PI, or comment) that -->
<!-- no more specific template below claims is copied to the output, and -->
<!-- its attributes and children are then processed in turn. Apart from -->
<!-- the templates below, this stylesheet is the identity transform. -->
<xsl:template match="node()|@*">
  <xsl:copy>
    <xsl:apply-templates select="node()|@*"/>
  </xsl:copy>
</xsl:template>
<!-- Subroutine that strips leading whitespace off of a string, but, -->
<!-- unlike normalize-space(), leaves trailing and internal -->
<!-- whitespace untouched. -->
<!--   Input:   parameter "str", a string -->
<!--   Returns: value of "str" w/o leading whitespace (the empty string -->
<!--            if "str" is empty or consists only of whitespace) -->
<!--   Calls:   itself, once per leading whitespace character -->
<xsl:template name="strip-leading-space">
  <!-- first and only parameter is a string -->
  <xsl:param name="str"/>
  <!-- parse off first character to examine -->
  <xsl:variable name="first" select="substring( $str, 1, 1 )"/>
  <xsl:choose>
    <!-- Test 1st char to see if it is a whitespace char as defined by -->
    <!-- the 'S' production of the XML specification. The characters are -->
    <!-- deliberately written as numeric character references: a literal -->
    <!-- tab, CR, or LF typed inside an attribute value is replaced by a -->
    <!-- plain space when the stylesheet is parsed, which would leave only -->
    <!-- the comparison against space in effect. -->
    <xsl:when test="$first='&#x20;' or $first='&#x09;' or $first='&#x0D;' or $first='&#x0A;'">
      <!-- yes: recursively call myself with the string parameter but with the -->
      <!-- first character (which is whitespace) stripped off -->
      <xsl:call-template name="strip-leading-space">
        <xsl:with-param name="str" select="substring( $str, 2 )"/>
      </xsl:call-template>
    </xsl:when>
    <xsl:otherwise>
      <!-- no: then there is no leading whitespace (or the string is empty); -->
      <!-- this is the string we wish to return -->
      <xsl:value-of select="$str"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
<!-- Match (or handle when called explicitly by name) any text node that -->
<!-- contains a soft hyphen (U+00AD, written below as a character reference -->
<!-- because the literal character is invisible in most editors). -->
<!--   Input:  optional parameter "strip1st"; when it is the string 'true' -->
<!--           the first word of this text node is dropped before processing -->
<!--   Output: the text of this node up to (not including) the soft hyphen, -->
<!--           then the first token of the next sibling text node that is -->
<!--           not all whitespace, then a newline -->
<!--   Calls:  strip-leading-space -->
<xsl:template match="text()[contains(.,'&#xAD;')]" name="hasShy">
  <!-- Should we be stripping off the first word of this text node? -->
  <xsl:param name="strip1st">false</xsl:param> <!-- not unless asked -->
  <!-- set the text to be processed (putting it in variable $me) -->
  <xsl:variable name="me">
    <xsl:choose>
      <!-- if we've been asked to strip off the first word, -->
      <xsl:when test="$strip1st = 'true'">
        <!-- take everything after the first blank of the normalized text. -->
        <!-- substring-after() already returns the empty string when there -->
        <!-- is no blank (only one word), so no separate guard is needed; -->
        <!-- testing the raw node for a blank would wrongly discard a node -->
        <!-- whose first word is followed by a newline or tab instead. -->
        <xsl:value-of select="substring-after(normalize-space(.),' ')"/>
      </xsl:when>
      <!-- else just normalize space, just so things are handled evenly -->
      <xsl:otherwise><xsl:value-of select="normalize-space(.)"/></xsl:otherwise>
    </xsl:choose>
  </xsl:variable>
  <!-- Get the string content of the next text node (other than text nodes that -->
  <!-- have nothing but whitespace) w/o its leading whitespace, if any. -->
  <!-- (Put it in variable $nextText.) -->
  <xsl:variable name="nextText">
    <xsl:call-template name="strip-leading-space">
      <xsl:with-param name="str" select="normalize-space(following-sibling::text()[not(normalize-space(.)='')][1])"/>
    </xsl:call-template>
  </xsl:variable>
  <!-- some rudimentary error-checking -->
  <xsl:choose>
    <!-- a second soft hyphen after the first one -->
    <xsl:when test="contains(substring-after($me,'&#xAD;'),'&#xAD;')">
      <xsl:message>Oh dear. More than 1 &amp;shy; in this text sequence, I'm not going to get this one right.</xsl:message>
    </xsl:when>
    <!-- $me is space-normalized, so anything left after the soft hyphen -->
    <!-- is non-whitespace text -->
    <xsl:when test="string-length( substring-after( $me,'&#xAD;') ) > 0">
      <xsl:message>Oh dear. This &amp;shy; is not the last (non-whitespace) character of the text node. I'm probably going to mess this up.</xsl:message>
    </xsl:when>
  </xsl:choose>
  <!-- output myself up to, but not including, shy -->
  <xsl:value-of select="substring-before($me,'&#xAD;')"/>
  <!-- first token of next text node -->
  <xsl:choose>
    <!-- if there is a blank, -->
    <xsl:when test="contains($nextText,' ')">
      <!-- take chars up to it -->
      <xsl:value-of select="substring-before($nextText,' ')"/>
    </xsl:when>
    <xsl:otherwise>
      <!-- else just take entire text string -->
      <xsl:value-of select="$nextText"/>
    </xsl:otherwise>
  </xsl:choose>
  <!-- a newline to separate this word from the rest -->
  <xsl:text>&#x0A;</xsl:text>
</xsl:template>
<!-- Now match text nodes for which the preceding text node (ignoring text -->
<!-- nodes that are nothing but whitespace) had a shy. -->
<!-- We want this template to be higher priority than 'hasShy', so that -->
<!-- when we hit a line that meets both criteria (has a shy, previous -->
<!-- text node had a shy), we come here first. -->
<!--   Output: this text node minus its leading whitespace and first token -->
<!--   Calls:  hasShy (when this node itself contains a shy), -->
<!--           strip-leading-space (otherwise) -->
<xsl:template match="text()[preceding-sibling::text()[not(normalize-space(.)='')][1][contains(.,'&#xAD;')]]" priority="1">
  <!-- preceding text node ended in shy, so it was caught by the hasShy template, -->
  <!-- which grabbed and printed out the first token of this text node. So our -->
  <!-- goal is to print out the rest. But if this node also ends in a shy, we have -->
  <!-- to process it accordingly, too. -->
  <xsl:choose>
    <xsl:when test="contains(.,'&#xAD;')">
      <xsl:call-template name="hasShy">
        <xsl:with-param name="strip1st">true</xsl:with-param>
      </xsl:call-template>
    </xsl:when>
    <xsl:otherwise>
      <!-- create a copy of myself (remember, I'm a text node) without leading -->
      <!-- whitespace, and with other whitespace characters converted to blank. -->
      <!-- Two details matter here: the tab, CR, and LF are given as character -->
      <!-- references because literal ones in an attribute value are turned -->
      <!-- into spaces when the stylesheet is parsed; and the replacement -->
      <!-- string has one blank per character, because translate() deletes -->
      <!-- any character that has no counterpart in the replacement string. -->
      <xsl:variable name="sans-leading">
        <xsl:call-template name="strip-leading-space">
          <xsl:with-param name="str" select="translate(.,'&#x09;&#x0D;&#x0A;','&#x20;&#x20;&#x20;')"/>
        </xsl:call-template>
      </xsl:variable>
      <!-- return everything after the first whitespace character -->
      <!-- (now that we've removed leading spaces, that means everything -->
      <!-- except the first token) -->
      <xsl:value-of select="substring-after( $sans-leading,' ')"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
</xsl:stylesheet>
I make no claim that this is even a good, let alone the best, way to do this. It did work on my test files, though.