List words.xslt

This is a relatively simple example of a stylesheet that will generate a list of words, ostensibly for text analysis. It is not intended to be used unmodified. The values of type= attributes of <div>, for example, should be changed to match your project's. Note that this stylesheet does not handle end-of-line hyphenated words properly. You should run your document through something like Unshy.xslt first.

<xsl:stylesheet version="1.0"
  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
  xmlns:in="http://www.tei-c.org/ns/1.0">

  <!--
  Read in a TEI P5 file, and spit out a text list of words that
  subsequent analysis may find useful.
  Written 2012-07-18 by Syd Bauman, based entirely on WWP's dev/only_words.xslt
  Copyleft 2010 by Syd Bauman and the Brown WWP
  -->

  <!-- Output declaration: ask the XSLT engine to serialize the result as plain -->
  <!-- text instead of XML, which is the default. -->
  <xsl:output method="text"/>

  <!-- Stylesheet parameter "case-fold": when true (the default) the text() template -->
  <!-- lower-cases the letters A to Z; supply a false value to keep the original case. -->
  <xsl:param name="case-fold" select="true()"/>
  
  <!-- Entry point: match the document node (the parent of the root element and of -->
  <!-- any top-level comments and processing instructions). -->
  <xsl:template match="/">
    <!-- First pass: do not write anything to the output yet. Everything the other -->
    <!-- templates produce is collected in a variable for post-processing. Both a -->
    <!-- lone TEI root and every TEI that is a child of a teiCorpus root are handled. -->
    <xsl:variable name="documentData">
      <xsl:apply-templates select="/in:TEI | /in:teiCorpus/in:TEI"/>
    </xsl:variable>
    <!-- Second pass: the variable now holds all the "words", separated by arbitrary -->
    <!-- runs of whitespace. normalize-space() trims the ends and collapses each run -->
    <!-- to one blank; translate() then turns every blank into a newline, giving one -->
    <!-- word per line. -->
    <!-- The newline MUST be written as the character reference &#10;. A literal line -->
    <!-- break typed inside an attribute value is replaced by a space when the XML -->
    <!-- parser normalizes the attribute, which would make translate() map blank to -->
    <!-- blank and leave the whole list on a single line. -->
    <xsl:value-of select="translate(normalize-space($documentData), ' ', '&#10;')"/>
  </xsl:template>

  <!-- Fallback for every element that has no more specific template below: -->
  <!-- recurse into its child elements and child text nodes only, so comments -->
  <!-- and processing instructions are never visited. The in:TEI elements -->
  <!-- selected by the root template end up here as well. -->
  <xsl:template match="*">
    <xsl:apply-templates select="node()[self::* or self::text()]"/>
  </xsl:template>
  
  <!-- Text nodes: replace punctuation with blanks, optionally fold case, and emit -->
  <!-- the result. (The output of this template is captured in the variable -->
  <!-- $documentData by the root template; it does not go straight to the result.) -->
  <xsl:template match="text()">
    <!-- Each of the 15 punctuation characters in the first string maps to the blank -->
    <!-- at the same position in the second string. Keep both strings the same -->
    <!-- length: translate() DELETES any character that has no counterpart. -->
    <xsl:variable name="no-punctuation"
      select="translate( ., '“”,.;:?!()-–—…&quot;',
                            '               ')"/>
    <!-- Parameters supplied from a command line usually arrive as strings, and in -->
    <!-- XPath 1.0 every non-empty string (even 'false') is true in a boolean -->
    <!-- context. So besides the plain boolean test, also treat the strings 'false' -->
    <!-- and '0' as a request to switch case-folding off. -->
    <xsl:variable name="fold-flag" select="normalize-space($case-fold)"/>
    <xsl:choose>
      <xsl:when test="$case-fold and not( $fold-flag = 'false' or $fold-flag = '0' )">
        <xsl:value-of select="translate( $no-punctuation,
          'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
          'abcdefghijklmnopqrstuvwxyz')"/>
        <!-- Note: if you are using alphabets other than the standard -->
        <!-- English 26, either add letters above or convert the stylesheet -->
        <!-- to XSLT 2.0 (or 3.0) and use the lower-case() function. -->
      </xsl:when>
      <xsl:otherwise>
        <xsl:value-of select="$no-punctuation"/>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:template>
  
  <!-- Constructs that are dropped wholesale: the empty template body means the -->
  <!-- matched node, its text and all of its descendants contribute nothing. -->
  <!-- Groups, in order: running matter and the header; whole divisions selected by -->
  <!-- their type; the unwanted alternative inside a choice; additions and -->
  <!-- deletions; notes not marked as the author's; assorted other elements; and -->
  <!-- finally the non-element, non-text node kinds. -->
  <xsl:template match="
      in:mw
    | in:fw
    | in:teiHeader
    | in:back//in:div[@type='editorial']
    | in:div[ @type='advert'
           or @type='castlist'
           or @type='colophon'
           or @type='contents'
           or @type='corrigenda'
           or @type='index' ]
    | in:choice[in:expan]/in:abbr
    | in:choice[in:corr]/in:sic
    | in:choice[in:reg]/in:orig
    | in:choice[in:ex]/in:am
    | in:choice/in:unclear[preceding-sibling::in:unclear]
    | in:add
    | in:del
    | in:note[not(@type='authorial')][not(@resp='author')]
    | in:figure
    | in:stage
    | in:docImprint
    | in:bibl
    | in:dateline
    | in:respLine
    | comment()
    | processing-instruction()
    | @*
    ">
    <!-- Listing comments, processing instructions and attributes here is not -->
    <!-- strictly needed: the other templates only ever apply templates to -->
    <!-- elements and text nodes, so those node kinds are never reached. They are -->
    <!-- named anyway to make the intent explicit: discard them. -->
  </xsl:template>
  
  <!-- Elements that imply a word break. A blank is emitted on BOTH sides of the -->
  <!-- element's content: the leading one separates it from whatever came before, -->
  <!-- the trailing one keeps text that directly follows the end tag (with no -->
  <!-- whitespace in between) from fusing onto the last word inside the element. -->
  <!-- Surplus blanks are harmless because the root template runs normalize-space() -->
  <!-- over the collected text. -->
  <xsl:template match="
      in:cb
    | in:lb
    | in:pb
    | in:milestone
    | in:div
    | in:p
    ">
    <xsl:text> </xsl:text>
    <xsl:apply-templates select="*|text()"/>
    <xsl:text> </xsl:text>
  </xsl:template>
    
</xsl:stylesheet>

List words.xslt

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

Tools