Unshy.xslt

From TEIWiki
Jump to navigation Jump to search

A stylesheet that reads in a TEI (or other XML) file and removes line-end soft hyphens. See the header comment for details.

<?xml version="1.0" encoding="UTF-8"?>
<!-- unshy.xslt -->
<!-- Reads in an XML file, writes out the same file with end-of-line soft hyphens -->
<!-- removed. Any whitespace between the soft hyphen and the next text node is -->
<!-- also removed. The markup between the soft hyphen and the next text node is -->
<!-- inserted immediately before the first whitespace character of the next text node -->
<!-- (or after it, if it has no internal whitespace). -->
<!-- Written 2008-07-24/26 by Syd Bauman -->
<!-- Updated 2008-08-07/08 by Syd Bauman: -->
<!--    bug fixes: -->
<!--    * incorrect conditional on last non-whitespace error message test -->
<!--    * skip all-whitespace nodes -->
<!--    * trim off leading whitespace, rather than completely normalize, so -->
<!--      that we don't nuke trailing space (sometimes it's important) -->
<!-- Copyleft 2008 Syd Bauman and the Brown University Women Writers Project -->
<!--  -->
<!-- Known issues (some would consider them bugs, others would say features): -->
<!-- * If there is more than one shy in a given text node, we end up -->
<!--   doing the wrong thing: deleting all the text after the *first* -->
<!--   shy, not the last. However, in this case we issue a warning -->
<!--   message, and since this case is never supposed to happen at the -->
<!--   WWP, we'd prefer the warning to better behavior. -->
<!-- * If the shy is not the last non-whitespace character, we end up -->
<!--   doing the wrong thing: deleting all the text after the shy, rather -->
<!--   than just any following whitespace. But again, this situation is -->
<!--   never supposed to happen at the WWP, so we'd prefer the warning to -->
<!--   doing the right thing. -->
<!-- * If the shy is the last text character in the document, it is silently -->
<!--   removed. It should probably be removed, but IMHO a warning would be a -->
<!--   good idea. -->
<!-- In truth, the right thing to do might be to validate that shys are placed -->
<!-- appropriately with some other software (like a Schematron schema), and then -->
<!-- in this stylesheet test for shy only at end-of-line (except for whitespace). -->
<xsl:stylesheet version="1.0"
  xmlns:xsl="http://www.w3.org/1999/XSL/Transform" >

  <!-- housekeeping -->
  <xsl:output encoding="UTF-8" omit-xml-declaration="no" method="xml"/>
  
  <!-- Entry point: from the document node, hand every child (the root -->
  <!-- element, probably /TEI or /tei:TEI, plus any comments or PIs that -->
  <!-- sit outside the root element) to the templates below. -->
  <xsl:template match="/">
    <xsl:apply-templates select="node()"/>
  </xsl:template>
  
  <!-- Identity rule: any element, attribute, text node, PI, or comment -->
  <!-- that no more specific template claims is copied through unchanged. -->
  <!-- Attributes are processed first, then children, which is the same -->
  <!-- order a single combined selection would visit them in. Apart from -->
  <!-- the more specific templates, this stylesheet is therefore the -->
  <!-- identity transform. -->
  <xsl:template match="node()|@*">
    <xsl:copy>
      <xsl:apply-templates select="@*"/>
      <xsl:apply-templates select="node()"/>
    </xsl:copy>
  </xsl:template>

  <!-- Named template: remove leading whitespace from a string while, -->
  <!-- unlike normalize-space(), keeping internal and trailing whitespace -->
  <!-- exactly as it was. Whitespace means the characters of the 'S' -->
  <!-- production of the XML specification (space, tab, CR, LF), which is -->
  <!-- also the set normalize-space() works with. -->
  <!-- Input: parameter "str", a string -->
  <!-- Returns: value of "str" w/o leading space; nothing at all if "str" -->
  <!--          is empty or consists only of whitespace -->
  <!-- Calls: no other template; the leading run is measured with XPath -->
  <!--        string functions instead of character-by-character recursion -->
  <xsl:template name="strip-leading-space">
    <!-- first and only parameter is a string -->
    <xsl:param name="str"/>
    <!-- the first non-whitespace character of $str (empty if there is none) -->
    <xsl:variable name="firstInk" select="substring( normalize-space( $str ), 1, 1 )"/>
    <xsl:if test="$firstInk != ''">
      <!-- whatever precedes that character's first occurrence is exactly -->
      <!-- the leading whitespace, because whitespace cannot contain it -->
      <xsl:variable name="lead" select="substring-before( $str, $firstInk )"/>
      <!-- return the string from the first non-whitespace character onward -->
      <xsl:value-of select="substring( $str, string-length( $lead ) + 1 )"/>
    </xsl:if>
  </xsl:template>
  
  <!-- Match (or handle when called explicitly by name) any text node that -->
  <!-- contains a soft hyphen (U+00AD). -->
  <!-- Input: optional parameter "strip1st"; when it is the string 'true', -->
  <!--        the first whitespace-delimited word of the current text node -->
  <!--        is dropped before processing. Default is 'false'. -->
  <!-- Output: the space-normalized text of the current node up to (but not -->
  <!--         including) the first soft hyphen, then the first word of the -->
  <!--         next non-blank following-sibling text node, then a newline. -->
  <!-- Side effect: an xsl:message warning if the node holds more than one -->
  <!--         soft hyphen, or if text follows the soft hyphen. -->
  <xsl:template match="text()[contains(.,'&#xAD;')]" name="hasShy">
    <!-- Should we be stripping off the first word of this text node? -->
    <xsl:param name="strip1st">false</xsl:param> <!-- not unless asked -->
    <!-- The current text with leading and trailing whitespace removed and -->
    <!-- every internal run of whitespace collapsed to a single blank. -->
    <xsl:variable name="normalized" select="normalize-space(.)"/>
    <!-- set the text to be processed (putting it in variable $me) -->
    <xsl:variable name="me">
      <xsl:choose>
        <!-- if we've been asked to strip off the first word, -->
        <xsl:when test="$strip1st = 'true'">
          <!-- take everything after the first blank of the NORMALIZED text. -->
          <!-- Looking for the blank in the normalized string (not in the raw -->
          <!-- node) matters: words separated only by a tab or a newline are -->
          <!-- still separate words. substring-after() returns the empty -->
          <!-- string when there is no blank, i.e. when there is only one word. -->
          <xsl:value-of select="substring-after($normalized,' ')"/>
        </xsl:when>
        <!-- else just normalize space, just so things are handled evenly -->
        <xsl:otherwise><xsl:value-of select="$normalized"/></xsl:otherwise>
      </xsl:choose>
    </xsl:variable>
    <!-- Get the string content of the next sibling text node (other than -->
    <!-- text nodes that have nothing but whitespace), space-normalized, so -->
    <!-- it has no leading whitespace and its words are separated by single -->
    <!-- blanks. Empty if there is no such node. -->
    <xsl:variable name="nextText"
      select="normalize-space(following-sibling::text()[not(normalize-space(.)='')][1])"/>
    <!-- some rudimentary error-checking -->
    <xsl:choose>
      <xsl:when test="contains(substring-after($me,'&#xAD;'),'&#xAD;')">
        <xsl:message>Oh dear. More than 1 &amp;shy; in this text sequence, I'm not going to get this one right.</xsl:message>
      </xsl:when>
      <xsl:when test="string-length( substring-after( $me,'&#xAD;') ) > 0">
        <xsl:message>Oh dear. This &amp;shy; is not the last (non-whitespace) character of the text node. I'm probably going to mess this up.</xsl:message>
      </xsl:when>
    </xsl:choose>
    <!-- output myself up to, but not including, shy -->
    <xsl:value-of select="substring-before($me,'&#xAD;')"/>
    <!-- first token of next text node -->
    <xsl:choose>
      <!-- if there is a blank, -->
      <xsl:when test="contains($nextText,' ')">
        <!-- take chars up to it -->
        <xsl:value-of select="substring-before($nextText,' ')"/>
      </xsl:when>
      <xsl:otherwise>
        <!-- else just take entire text string -->
        <xsl:value-of select="$nextText"/>
      </xsl:otherwise>
    </xsl:choose>
    <!-- a newline to separate this word from the rest -->
    <xsl:text>&#x0A;</xsl:text>
  </xsl:template>

  <!-- Text nodes whose nearest preceding non-blank sibling text node -->
  <!-- contains a shy. The explicit priority puts this rule ahead of -->
  <!-- 'hasShy', so a node that meets both criteria (it has a shy itself, -->
  <!-- and the previous text node had one) is handled here first. -->
  <xsl:template match="text()[preceding-sibling::text()[not(normalize-space(.)='')][1][contains(.,'&#xAD;')]]" priority="1">
    <!-- The 'hasShy' template, run on the preceding text node, has already -->
    <!-- written out the first token of this node, so only the remainder -->
    <!-- is due here. -->
    <xsl:choose>
      <xsl:when test="not(contains(.,'&#xAD;'))">
        <!-- No shy of my own. Turn tab, CR, and LF into blanks so that any -->
        <!-- whitespace character can mark the end of the first token, -->
        <xsl:variable name="blanked" select="translate(.,'&#x09;&#x0D;&#x0A;','   ')"/>
        <!-- drop the leading whitespace, -->
        <xsl:variable name="trimmed">
          <xsl:call-template name="strip-leading-space">
            <xsl:with-param name="str" select="$blanked"/>
          </xsl:call-template>
        </xsl:variable>
        <!-- and emit what follows the first blank, i.e. everything except -->
        <!-- the first token (nothing, if there is only one token) -->
        <xsl:value-of select="substring-after($trimmed,' ')"/>
      </xsl:when>
      <xsl:otherwise>
        <!-- This node has a shy too: let 'hasShy' process it, asking it to -->
        <!-- discard the already-printed first word. -->
        <xsl:call-template name="hasShy">
          <xsl:with-param name="strip1st" select="'true'"/>
        </xsl:call-template>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:template>
  
</xsl:stylesheet>

I make no claim that this is even a good, let alone the best, way to do this. It did work on my test files, though.