List words.xslt
Jump to navigation
Jump to search
This is a relatively simple example of a stylesheet that will generate a list of words, ostensibly for text analysis. It is not intended to be used unmodified. The values of type= attributes of <div>, for example, should be changed to match your project's. Note that this stylesheet does not handle end-of-line hyphenated words properly. You should run your document through something like Unshy.xslt first.
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:in="http://www.tei-c.org/ns/1.0">
<!--
Read in a TEI P5 file, and spit out a text list of words that
subsequent analysis may find useful.
Written 2012-07-18 by Syd Bauman, based entirely on WWP's dev/only_words.xslt
Copyleft 2010 by Syd Bauman and the Brown WWP
-->
<!-- Output declaration: serialize the result as plain text instead of -->
<!-- XML, which is the XSLT default output method. -->
<xsl:output method="text"/>
<!-- Stylesheet parameter: does the user want the words case-folded (i.e. -->
<!-- converted to lower case)? Defaults to true(), so folding is on unless -->
<!-- the caller overrides the value. The text() template tests this parameter. -->
<xsl:param name="case-fold" select="true()"/>
<!-- Match the document root (which itself contains at least the root element, -->
<!-- and probably some comments and processing instructions, too). -->
<!-- Result: the words of the document, one per line. -->
<xsl:template match="/">
  <!-- First pass: walk the document tree, but rather than writing the result -->
  <!-- out, save it in a variable for further processing. -->
  <xsl:variable name="documentData">
    <!-- Process the TEI document or documents, whether the input is a single -->
    <!-- TEI element or a teiCorpus that wraps several of them. -->
    <xsl:apply-templates select="/in:TEI | /in:teiCorpus/in:TEI"/>
  </xsl:variable>
  <!-- Second pass: the variable now holds all of the “words”, but also lots -->
  <!-- of whitespace, some of it quite weird. normalize-space() reduces every -->
  <!-- run of whitespace to a single blank (and trims both ends); translate() -->
  <!-- then turns each blank into a newline, giving one word per line. -->
  <!-- The newline is deliberately written as the character reference &#10; -->
  <!-- because a literal line break inside an attribute value is normalized -->
  <!-- to a blank by the XML parser, which would make translate() a no-op. -->
  <xsl:value-of select="translate(normalize-space($documentData), ' ', '&#10;')"/>
</xsl:template>
<!-- Fallback rule for every element that has no more specific template -->
<!-- below: descend into its child elements and child text nodes only, so -->
<!-- that comments and processing instructions are never visited. This is -->
<!-- also the rule that picks up in:TEI when the root template applies -->
<!-- templates to it. -->
<xsl:template match="*">
  <xsl:apply-templates select="node()[self::* or self::text()]"/>
</xsl:template>
<!-- For every text node we process, replace each punctuation character with -->
<!-- a blank, optionally fold the result to lower case, and emit it. -->
<!-- (Remember that what we emit goes into the variable $documentData of the -->
<!-- root template, not directly to the output.) -->
<xsl:template match="text()">
  <!-- translate() DELETES any character of its 2nd argument that has no -->
  <!-- counterpart at the same position in its 3rd argument. To turn the -->
  <!-- punctuation into word breaks (rather than gluing the neighbouring -->
  <!-- words together) the replacement string must therefore hold one blank -->
  <!-- per punctuation character: 15 characters, 15 blanks. If you add a -->
  <!-- punctuation character, add a blank as well. -->
  <!-- The straight double quote is written as &quot; because this attribute -->
  <!-- value is itself delimited by double quotes. -->
  <xsl:variable name="no-punctuation"
                select="translate(., '“”,.;:?!()-–—…&quot;', '               ')"/>
  <xsl:choose>
    <!-- Fold case unless the parameter is false. Parameters supplied on a -->
    <!-- command line usually arrive as strings, and any non-empty string -->
    <!-- (including 'false') is true when tested as a boolean; hence the -->
    <!-- extra comparison against the string 'false'. -->
    <xsl:when test="$case-fold and string($case-fold) != 'false'">
      <xsl:value-of select="translate($no-punctuation,
                                      'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                                      'abcdefghijklmnopqrstuvwxyz')"/>
      <!-- Note: if you are using alphabets other than the standard -->
      <!-- English 26, either add letters above or convert the stylesheet -->
      <!-- to XSLT 2.0 (or 3.0) and use the lower-case() function. -->
    </xsl:when>
    <xsl:otherwise>
      <xsl:value-of select="$no-punctuation"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
<!-- Null template. Every node matched here is discarded outright: the -->
<!-- template has no body, so neither the node nor any of its text content -->
<!-- or descendants contributes to the word list. The patterns cover: -->
<!--   * the header and running page furniture (teiHeader, fw, mw); -->
<!--   * editorial divisions inside the back matter, plus divisions whose -->
<!--     type marks them as adverts, cast lists, colophons, tables of -->
<!--     contents, corrigenda or indexes; -->
<!--   * inside a choice: abbr, sic, orig and am when the choice also holds -->
<!--     the matching expan, corr, reg or ex, and every unclear after the -->
<!--     first one; -->
<!--   * add and del, and any note that is neither typed 'authorial' nor -->
<!--     attributed to 'author'; -->
<!--   * bibl, dateline, docImprint, figure, respLine and stage. -->
<!-- Comments, processing instructions and attributes are listed only to be -->
<!-- explicit: they would never be processed anyway, because the other -->
<!-- templates apply templates to elements and text nodes only. -->
<xsl:template match="
    in:teiHeader
  | in:fw
  | in:mw
  | in:back//in:div[@type='editorial']
  | in:div[   @type='advert'
           or @type='castlist'
           or @type='colophon'
           or @type='contents'
           or @type='corrigenda'
           or @type='index' ]
  | in:choice[in:expan]/in:abbr
  | in:choice[in:corr]/in:sic
  | in:choice[in:reg]/in:orig
  | in:choice[in:ex]/in:am
  | in:choice/in:unclear[preceding-sibling::in:unclear]
  | in:add
  | in:del
  | in:note[not(@type='authorial') and not(@resp='author')]
  | in:bibl
  | in:dateline
  | in:docImprint
  | in:figure
  | in:respLine
  | in:stage
  | comment()
  | processing-instruction()
  | @*
  "/>
<!-- Some elements “cause” a word-break: column, line and page breaks, -->
<!-- milestones, divisions and paragraphs. Surround the content of such an -->
<!-- element with blanks so that its first and last words can never be glued -->
<!-- to the text on either side of it, then carry on with its element and -->
<!-- text children. For the empty break elements the two blanks are simply -->
<!-- adjacent; surplus blanks cost nothing, because the root template -->
<!-- collapses whitespace with normalize-space() before writing anything out. -->
<xsl:template match="in:cb | in:lb | in:pb | in:milestone | in:div | in:p">
  <!-- word boundary in front of the element's content -->
  <xsl:text> </xsl:text>
  <xsl:apply-templates select="*|text()"/>
  <!-- ...and after it, for text that directly follows the end tag -->
  <xsl:text> </xsl:text>
</xsl:template>
</xsl:stylesheet>