Difference between revisions of "Unshy.xslt"
Jump to navigation
Jump to search
(bug fixes) |
|||
(2 intermediate revisions by the same user not shown) | |||
Line 10: | Line 10: | ||
<!-- (or after it, if it has no internal whitespace). --> | <!-- (or after it, if it has no internal whitespace). --> | ||
<!-- Written 2008-07-24/26 by Syd Bauman --> | <!-- Written 2008-07-24/26 by Syd Bauman --> | ||
+ | <!-- Updated 2008-08-07/08 by Syd Bauman: --> | ||
+ | <!-- bug fixes: --> | ||
+ | <!-- * incorrect conditional on last non-whitespace error message test --> | ||
+ | <!-- * skip all-whitespace nodes --> | ||
+ | <!-- * trim off leading whitespace, rather than completely normalize, so --> | ||
+ | <!-- that we don't nuke trailing space (sometimes it's important) --> | ||
<!-- Copyleft 2008 Syd Bauman and the Brown University Women Writers Project --> | <!-- Copyleft 2008 Syd Bauman and the Brown University Women Writers Project --> | ||
<!-- --> | <!-- --> | ||
− | <!-- Known issues (some would consider them bugs, | + | <!-- Known issues (some would consider them bugs, others would say features): --> |
− | <!-- * If there | + | <!-- * If there is more than one shy in a given text node, we end up --> |
<!-- doing the wrong thing: deleting all the text after the *first* --> | <!-- doing the wrong thing: deleting all the text after the *first* --> | ||
<!-- shy, not the last. However, in this case we issue a warning --> | <!-- shy, not the last. However, in this case we issue a warning --> | ||
Line 26: | Line 32: | ||
<!-- removed. It should probably be removed, but IMHO a warning would be a --> | <!-- removed. It should probably be removed, but IMHO a warning would be a --> | ||
<!-- good idea. --> | <!-- good idea. --> | ||
− | <!-- In truth, the right thing to do might be to validate that shys are | + | <!-- In truth, the right thing to do might be to validate that shys are plqced --> |
<!-- appropiately with some other software (like a Schematron schema), and then --> | <!-- appropiately with some other software (like a Schematron schema), and then --> | ||
<!-- in this stylesheet test for shy only at end-of-line (except for whitespace). --> | <!-- in this stylesheet test for shy only at end-of-line (except for whitespace). --> | ||
<xsl:stylesheet version="1.0" | <xsl:stylesheet version="1.0" | ||
− | xmlns:xsl="http://www.w3.org/1999/XSL/Transform | + | xmlns:xsl="http://www.w3.org/1999/XSL/Transform" > |
− | |||
<!-- housekeeping --> | <!-- housekeeping --> | ||
Line 49: | Line 54: | ||
<xsl:apply-templates select="@*|node()"/> | <xsl:apply-templates select="@*|node()"/> | ||
</xsl:copy> | </xsl:copy> | ||
+ | </xsl:template> | ||
+ | |||
+ | <!-- Subroutine that strips leading space off of a string, but, --> | ||
+ | <!-- unlike normalize-space(), leaves trailing and internal --> | ||
+ | <!-- space untouched. --> | ||
+ | <!-- Input: parameter "str", a string --> | ||
+ | <!-- Returns: value of "str" w/o leading space --> | ||
+ | <!-- Calls: itself --> | ||
+ | <xsl:template name="strip-leading-space"> | ||
+ | <!-- first and only parameter is a string --> | ||
+ | <xsl:param name="str"/> | ||
+ | <!-- parse off first character to examine --> | ||
+ | <xsl:variable name="first" select="substring( $str, 1, 1 )"/> | ||
+ | <xsl:choose> | ||
+ | <!-- test 1st char to see if it is a whitespace char as defined by --> | ||
+ | <!-- the 'S' production of the XML specification. --> | ||
+ | <xsl:when test="$first='&#x20;' or $first='&#x09;' or $first='&#x0D;' or $first='&#x0A;'"> | ||
+ | <!-- yes: recursively call myself with the string parameter but with the --> | ||
+ | <!-- first character (which is whitespace) stripped off --> | ||
+ | <xsl:call-template name="strip-leading-space"> | ||
+ | <xsl:with-param name="str" select="substring( $str, 2 )"/> | ||
+ | </xsl:call-template> | ||
+ | </xsl:when> | ||
+ | <xsl:otherwise> | ||
+ | <!-- no: then there is no leading whitespace, this is the string we wish to return --> | ||
+ | <xsl:value-of select="$str"/> | ||
+ | </xsl:otherwise> | ||
+ | </xsl:choose> | ||
</xsl:template> | </xsl:template> | ||
<!-- Match (or handle when called explicitly) any text node that contains --> | <!-- Match (or handle when called explicitly) any text node that contains --> | ||
<!-- a soft hyphen. --> | <!-- a soft hyphen. --> | ||
− | <xsl:template match="text()[contains(.,'&# | + | <xsl:template match="text()[contains(.,'&#xAD;')]" name="hasShy"> |
<!-- Should we be stripping off the first word of this text node? --> | <!-- Should we be stripping off the first word of this text node? --> | ||
<xsl:param name="strip1st">false</xsl:param> <!-- not unless asked --> | <xsl:param name="strip1st">false</xsl:param> <!-- not unless asked --> | ||
Line 75: | Line 108: | ||
</xsl:choose> | </xsl:choose> | ||
</xsl:variable> | </xsl:variable> | ||
− | <!-- | + | <!-- Get the string content of the next text node (other than text nodes that --> |
+ | <!-- have nothing but whitespace) w/o its leading whitespace, if any. --> | ||
+ | <!-- (Put it in variable $nextText.) --> | ||
<xsl:variable name="nextText"> | <xsl:variable name="nextText"> | ||
− | <xsl: | + | <xsl:call-template name="strip-leading-space"> |
+ | <xsl:with-param name="str" select="normalize-space(following-sibling::text()[not(normalize-space(.)='')][1])"/> | ||
+ | </xsl:call-template> | ||
</xsl:variable> | </xsl:variable> | ||
<!-- some rudimentary error-checking --> | <!-- some rudimentary error-checking --> | ||
<xsl:choose> | <xsl:choose> | ||
− | <xsl:when test="contains(substring-after($me,'­'),'­')"> | + | <xsl:when test="contains(substring-after($me,'&#xAD;'),'&#xAD;')"> |
− | <xsl:message>Oh dear. More than 1 &shy; in this text sequence, I'm not going to get this one right.</xsl:message> | + | <xsl:message>Oh dear. More than 1 &amp;shy; in this text sequence, I'm not going to get this one right.</xsl:message> |
</xsl:when> | </xsl:when> | ||
− | <xsl:when test=" | + | <xsl:when test="string-length( substring-after( $me,'&#xAD;') ) > 0"> |
− | <xsl:message>Oh dear. This &shy; is not the last (non-whitespace) character of the text node. I'm probably going to mess this up.</xsl:message> | + | <xsl:message>Oh dear. This &amp;shy; is not the last (non-whitespace) character of the text node. I'm probably going to mess this up.</xsl:message> |
</xsl:when> | </xsl:when> | ||
</xsl:choose> | </xsl:choose> | ||
<!-- output myself up to, but not including, shy --> | <!-- output myself up to, but not including, shy --> | ||
− | <xsl:value-of select="substring-before($me,'­')"/> | + | <xsl:value-of select="substring-before($me,'&#xAD;')"/> |
<!-- first token of next text node --> | <!-- first token of next text node --> | ||
<xsl:choose> | <xsl:choose> | ||
Line 103: | Line 140: | ||
</xsl:choose> | </xsl:choose> | ||
<!-- a newline to separate this word from the rest --> | <!-- a newline to separate this word from the rest --> | ||
− | <xsl:text>
</xsl:text> | + | <xsl:text>&#x0A;</xsl:text> |
</xsl:template> | </xsl:template> | ||
− | <!-- Now match text nodes for which the preceding text node had a shy --> | + | <!-- Now match text nodes for which the preceding text node had a shy. --> |
− | <!-- | + | <!-- We want this template to be higher priority than 'hasShy', so that --> |
<!-- when we hit a line that meets both criteria (has a shy, previous --> | <!-- when we hit a line that meets both criteria (has a shy, previous --> | ||
<!-- text node had a shy), we come here first --> | <!-- text node had a shy), we come here first --> | ||
− | <xsl:template match="text()[preceding-sibling::text()[1][contains(.,'­')]]" priority="1"> | + | <xsl:template match="text()[preceding-sibling::text()[not(normalize-space(.)='')][1][contains(.,'&#xAD;')]]" priority="1"> |
<!-- preceding text node ended in shy, so it was caught by the hasShy template, --> | <!-- preceding text node ended in shy, so it was caught by the hasShy template, --> | ||
<!-- which grabbed and printed out the first token of this text node. So our --> | <!-- which grabbed and printed out the first token of this text node. So our --> | ||
<!-- goal is to print out the rest. But if this node also ends in a shy, we have --> | <!-- goal is to print out the rest. But if this node also ends in a shy, we have --> | ||
− | <!-- process it accordingly, too. --> | + | <!-- to process it accordingly, too. --> |
<xsl:choose> | <xsl:choose> | ||
− | <xsl:when test="contains(.,'&# | + | <xsl:when test="contains(.,'&#xAD;')"> |
<xsl:call-template name="hasShy"> | <xsl:call-template name="hasShy"> | ||
<xsl:with-param name="strip1st">true</xsl:with-param> | <xsl:with-param name="strip1st">true</xsl:with-param> | ||
Line 122: | Line 159: | ||
</xsl:when> | </xsl:when> | ||
<xsl:otherwise> | <xsl:otherwise> | ||
− | <xsl:value-of select="substring-after( | + | <!-- create a copy of myself (remember, I'm a text node) without leading --> |
+ | <!-- whitespace, and with other whitespace characters converted to blank --> | ||
+ | <xsl:variable name="sans-leading"> | ||
+ | <xsl:call-template name="strip-leading-space"> | ||
+ | <xsl:with-param name="str"><xsl:value-of select="translate(.,'&#x09;&#x0D;&#x0A;',' ')"/></xsl:with-param> | ||
+ | </xsl:call-template> | ||
+ | </xsl:variable> | ||
+ | <!-- return everything after the first whitespace character --> | ||
+ | <!-- (now that we've removed leading spaces, that means everything --> | ||
+ | <!-- except the first token) --> | ||
+ | <xsl:value-of select="substring-after( $sans-leading,' ')"/> | ||
</xsl:otherwise> | </xsl:otherwise> | ||
</xsl:choose> | </xsl:choose> | ||
Line 129: | Line 176: | ||
</xsl:stylesheet> | </xsl:stylesheet> | ||
</nowiki></pre> | </nowiki></pre> | ||
+ | |||
I make no claim that this is even a good, let alone the best, way to do this. It did work on my test files, though. | I make no claim that this is even a good, let alone the best, way to do this. It did work on my test files, though. | ||
+ | |||
+ | [[Category:XSLT]][[Category:XSLT:1.0]] |
Latest revision as of 17:34, 8 August 2008
A stylesheet to read in TEI (or other XML files) and remove line-end soft hyphens. See header comment for details.
<?xml version="1.0" encoding="UTF-8"?>
<!-- unshy.xslt -->
<!-- Reads in an XML file, writes out the same file with end-of-line soft hyphens -->
<!-- removed. Any whitespace between the soft hyphen and the next text node is -->
<!-- also removed. The markup between the soft hyphen and the next text node is -->
<!-- inserted immediately before the first whitespace character of the next text node -->
<!-- (or after it, if it has no internal whitespace). -->
<!-- Written 2008-07-24/26 by Syd Bauman -->
<!-- Updated 2008-08-07/08 by Syd Bauman: -->
<!-- bug fixes: -->
<!--   * incorrect conditional on last non-whitespace error message test -->
<!--   * skip all-whitespace nodes -->
<!--   * trim off leading whitespace, rather than completely normalize, so -->
<!--     that we don't nuke trailing space (sometimes it's important) -->
<!-- Copyleft 2008 Syd Bauman and the Brown University Women Writers Project -->
<!-- -->
<!-- Known issues (some would consider them bugs, others would say features): -->
<!--   * If there is more than one shy in a given text node, we end up -->
<!--     doing the wrong thing: deleting all the text after the *first* -->
<!--     shy, not the last. However, in this case we issue a warning -->
<!--     message, and since this case is never supposed to happen at the -->
<!--     WWP, we'd prefer the warning to better behavior. -->
<!--   * If the shy is not the last non-whitespace character, we end up -->
<!--     doing the wrong thing: deleting all the text after the shy, rather -->
<!--     than just any following whitespace. But again, this situation is -->
<!--     never supposed to happen at the WWP, so we'd prefer the warning to -->
<!--     doing the right thing. -->
<!--   * If the shy is the last text character in the document, it is silently -->
<!--     removed. It should probably be removed, but IMHO a warning would be a -->
<!--     good idea. -->
<!-- In truth, the right thing to do might be to validate that shys are placed -->
<!-- appropriately with some other software (like a Schematron schema), and then -->
<!-- in this stylesheet test for shy only at end-of-line (except for whitespace). -->
<!-- -->
<!-- Note on notation: the soft hyphen (U+00AD) and the whitespace characters -->
<!-- TAB, CR and LF are always written as numeric character references in this -->
<!-- file. Written literally inside an attribute value, TAB, CR and LF would be -->
<!-- turned into plain spaces by the XML parser before XSLT ever sees them, and -->
<!-- a literal soft hyphen is invisible in most editors. -->
<xsl:stylesheet version="1.0"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform">

  <!-- housekeeping -->
  <xsl:output encoding="UTF-8" omit-xml-declaration="no" method="xml"/>

  <!-- start at root, match any children (i.e., root element, probably /TEI or /tei:TEI, -->
  <!-- and any comments or PIs outside the root element) -->
  <xsl:template match="/">
    <xsl:apply-templates/>
  </xsl:template>

  <!-- For any node (element, attribute, text, PI, or comment) that is not -->
  <!-- matched more specifically below, copy it to itself. Thus, except for -->
  <!-- the templates below, this stylesheet is the identity transform. -->
  <xsl:template match="@*|node()">
    <xsl:copy>
      <xsl:apply-templates select="@*|node()"/>
    </xsl:copy>
  </xsl:template>

  <!-- Subroutine that strips leading space off of a string, but, -->
  <!-- unlike normalize-space(), leaves trailing and internal -->
  <!-- space untouched. -->
  <!-- Input: parameter "str", a string -->
  <!-- Returns: value of "str" w/o leading space -->
  <!-- Calls: itself (once per leading whitespace character) -->
  <xsl:template name="strip-leading-space">
    <!-- first and only parameter is a string -->
    <xsl:param name="str"/>
    <!-- parse off first character to examine; this is the empty string -->
    <!-- when $str is empty, which ends the recursion via xsl:otherwise -->
    <xsl:variable name="first" select="substring( $str, 1, 1 )"/>
    <xsl:choose>
      <!-- test 1st char to see if it is a whitespace char as defined by -->
      <!-- the 'S' production of the XML specification. -->
      <xsl:when test="$first='&#x20;' or $first='&#x09;' or $first='&#x0D;' or $first='&#x0A;'">
        <!-- yes: recursively call myself with the string parameter but with the -->
        <!-- first character (which is whitespace) stripped off -->
        <xsl:call-template name="strip-leading-space">
          <xsl:with-param name="str" select="substring( $str, 2 )"/>
        </xsl:call-template>
      </xsl:when>
      <xsl:otherwise>
        <!-- no: then there is no leading whitespace, this is the string we wish to return -->
        <xsl:value-of select="$str"/>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:template>

  <!-- Match (or handle when called explicitly) any text node that contains -->
  <!-- a soft hyphen (U+00AD). -->
  <!-- Input: parameter "strip1st"; when 'true' the first whitespace-delimited -->
  <!--        token of this text node is dropped before processing, because -->
  <!--        the caller knows that token has already been written out. -->
  <!-- Output: the text before the shy, immediately followed by the first -->
  <!--         token of the next non-blank sibling text node, then a newline. -->
  <xsl:template match="text()[contains(.,'&#xAD;')]" name="hasShy">
    <!-- Should we be stripping off the first word of this text node? -->
    <xsl:param name="strip1st">false</xsl:param> <!-- not unless asked -->
    <!-- set the text to be processed (putting it in variable $me) -->
    <xsl:variable name="me">
      <xsl:choose>
        <!-- if we've been asked to strip off the first word, do so. -->
        <!-- The split is made on the normalized string, so that a first word -->
        <!-- that is followed by a tab or newline (rather than a blank) is -->
        <!-- recognized too. substring-after() returns the empty string when -->
        <!-- there is no blank, i.e. when there is only one word. -->
        <xsl:when test="$strip1st = 'true'">
          <xsl:value-of select="substring-after(normalize-space(.),' ')"/>
        </xsl:when>
        <!-- else just normalize space, just so things are handled evenly -->
        <xsl:otherwise><xsl:value-of select="normalize-space(.)"/></xsl:otherwise>
      </xsl:choose>
    </xsl:variable>
    <!-- Get the string content of the next sibling text node (other than text -->
    <!-- nodes that have nothing but whitespace), put it in variable $nextText. -->
    <!-- normalize-space() already removes the leading whitespace and turns -->
    <!-- every internal whitespace run into a single blank, which is all we -->
    <!-- need here because only the first token is used below. -->
    <xsl:variable name="nextText"
                  select="normalize-space(following-sibling::text()[not(normalize-space(.)='')][1])"/>
    <!-- some rudimentary error-checking -->
    <xsl:choose>
      <xsl:when test="contains(substring-after($me,'&#xAD;'),'&#xAD;')">
        <xsl:message>Oh dear. More than 1 &amp;shy; in this text sequence, I'm not going to get this one right.</xsl:message>
      </xsl:when>
      <xsl:when test="string-length( substring-after( $me,'&#xAD;') ) &gt; 0">
        <xsl:message>Oh dear. This &amp;shy; is not the last (non-whitespace) character of the text node. I'm probably going to mess this up.</xsl:message>
      </xsl:when>
    </xsl:choose>
    <!-- output myself up to, but not including, shy -->
    <xsl:value-of select="substring-before($me,'&#xAD;')"/>
    <!-- first token of next text node -->
    <xsl:choose>
      <!-- if there is a blank, -->
      <xsl:when test="contains($nextText,' ')">
        <!-- take chars up to it -->
        <xsl:value-of select="substring-before($nextText,' ')"/>
      </xsl:when>
      <xsl:otherwise>
        <!-- else just take entire text string -->
        <xsl:value-of select="$nextText"/>
      </xsl:otherwise>
    </xsl:choose>
    <!-- a newline to separate this word from the rest -->
    <xsl:text>&#x0A;</xsl:text>
  </xsl:template>

  <!-- Now match text nodes for which the preceding text node had a shy. -->
  <!-- (Whitespace-only text nodes are skipped when looking backwards.) -->
  <!-- We want this template to be higher priority than 'hasShy', so that -->
  <!-- when we hit a line that meets both criteria (has a shy, previous -->
  <!-- text node had a shy), we come here first -->
  <xsl:template match="text()[preceding-sibling::text()[not(normalize-space(.)='')][1][contains(.,'&#xAD;')]]" priority="1">
    <!-- preceding text node ended in shy, so it was caught by the hasShy template, -->
    <!-- which grabbed and printed out the first token of this text node. So our -->
    <!-- goal is to print out the rest. But if this node also ends in a shy, we have -->
    <!-- to process it accordingly, too. -->
    <xsl:choose>
      <xsl:when test="contains(.,'&#xAD;')">
        <xsl:call-template name="hasShy">
          <xsl:with-param name="strip1st">true</xsl:with-param>
        </xsl:call-template>
      </xsl:when>
      <xsl:otherwise>
        <!-- create a copy of myself (remember, I'm a text node) without leading -->
        <!-- whitespace, and with other whitespace characters converted to blank. -->
        <!-- The replacement string must hold one blank per character being -->
        <!-- replaced: translate() deletes any character that has no counterpart -->
        <!-- in its third argument, which would glue two words together. -->
        <xsl:variable name="sans-leading">
          <xsl:call-template name="strip-leading-space">
            <xsl:with-param name="str" select="translate(.,'&#x09;&#x0D;&#x0A;','&#x20;&#x20;&#x20;')"/>
          </xsl:call-template>
        </xsl:variable>
        <!-- return everything after the first whitespace character -->
        <!-- (now that we've removed leading spaces, that means everything -->
        <!-- except the first token) -->
        <xsl:value-of select="substring-after( $sans-leading,' ')"/>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:template>

</xsl:stylesheet>
I make no claim that this is even a good, let alone the best, way to do this. It did work on my test files, though.