Difference between revisions of "Milestone-chunk.xquery"
DavidSewell (talk | contribs) (Created) |
(modifying original script to include xml attributes and empty elements like handShift) |
||
(12 intermediate revisions by 7 users not shown) | |||
Line 109: | Line 109: | ||
else if ( $node >> $ms1 and $node << $ms2 ) then $node | else if ( $node >> $ms1 and $node << $ms2 ) then $node | ||
else () | else () | ||
− | case attribute() return | + | case attribute() return $node (: will never match attributes outside non-returned elements :) |
− | |||
default return | default return | ||
if ( $node >> $ms1 and $node << $ms2 ) then $node | if ( $node >> $ms1 and $node << $ms2 ) then $node | ||
Line 117: | Line 116: | ||
</nowiki></pre> | </nowiki></pre> | ||
− | [[Category: | + | == Implementation in eXist == |
+ | A string-based function to return fragments between two nodes is implemented in [[eXist]] (as a function called [http://exist-db.org/exist/apps/fundocs/view.html?uri=http://exist-db.org/xquery/util&location=java:org.exist.xquery.functions.util.UtilModule&details=true#get-fragment-between.4 util:get-fragment-between()]) by Josef Willenborg, Max Planck Institute for the History of Science [mailto:jwillenborg@mpiwg-berlin.mpg.de jwillenborg@mpiwg-berlin.mpg.de]: | ||
+ | |||
+ | <pre><nowiki> | ||
+ | util:get-fragment-between($beginning-node as node()?, $ending-node as node()?, $make-fragment as xs:boolean?) xs:string | ||
+ | </nowiki></pre> | ||
+ | |||
+ | Returns an xml fragment or a sequence of nodes between two elements (normally milestone elements). The $beginning-node represents the first node/milestone element, $ending-node, the second one. The third argument, $make-fragment, is a boolean value for the path completion. If it is set to true() the result sequence is wrapped into a parent element node. | ||
+ | |||
+ | Example call of the function for getting the fragment between two TEI page break element nodes: | ||
+ | <pre><nowiki> | ||
+ | let $fragment := util:get-fragment-between(//pb[1], //pb[2], true()) | ||
+ | </nowiki></pre> | ||
+ | |||
+ | == Extended version which keeps the namespace declaration == | ||
+ | A short explanation of the problem [http://sourceforge.net/mailarchive/forum.php?thread_name=CAKKtb%2Buzzd0RYJHfcvpz6z6ogUwBcbXhPDkk%2Bahq3YAaF0u_ig%40mail.gmail.com&forum_name=exist-open] | ||
+ | <pre><nowiki> | ||
+ | (: Copies the content of $node that lies between two milestone elements. | ||
+ |    Elements that contain a milestone are rebuilt under their own QName | ||
+ |    (prefix and namespace URI included) and filtered recursively. | ||
+ |    $ms1  : opening milestone; it is copied into the result. | ||
+ |    $ms2  : closing milestone; it is left out of the result. | ||
+ |    $node : node being examined; the initial call passes an element that | ||
+ |            is a common ancestor of both milestones. :) | ||
+ | declare function local:milestone-chunk-ns( | ||
+ | $ms1 as element(), | ||
+ | $ms2 as element(), | ||
+ | $node as node()* | ||
+ | ) as node()* | ||
+ | { | ||
+ | typeswitch ($node) | ||
+ | (: attributes are only reached through an element that is being rebuilt :) | ||
+ | case attribute() return $node | ||
+ | case element() return | ||
+ | if ($node is $ms1) then $node | ||
+ | else if ( exists($node/descendant::*[. is $ms1 or . is $ms2]) ) | ||
+ | then | ||
+ | (: node-name() yields the same QName as QName(namespace-uri(), name()) :) | ||
+ | element { node-name($node) } | ||
+ | { for $child in ( $node/@* | $node/node() ) | ||
+ | return local:milestone-chunk-ns($ms1, $ms2, $child) } | ||
+ | else $node[. >> $ms1 and . << $ms2] | ||
+ | default return | ||
+ | (: text, comment and PI nodes are kept only when strictly between the milestones :) | ||
+ | if ( $node >> $ms1 and $node << $ms2 ) then $node | ||
+ | else () | ||
+ | }; | ||
+ | </nowiki></pre> | ||
+ | |||
+ | == Extended version which maintains xml attributes in scope and listed preceding empty elements == | ||
+ | <pre><nowiki> | ||
+ | xquery version "1.0"; | ||
+ | |||
+ | declare namespace tei="http://www.tei-c.org/ns/1.0"; | ||
+ | |||
+ | (: Returns the deepest element that is an ancestor of both $start-node and | ||
+ |    $end-node. Each node is looked up among the descendant elements of | ||
+ |    $element first, so a node that is not found there contributes no ancestors | ||
+ |    and the declared return type element() is then violated. :) | ||
+ | declare function local:get-common-ancestor($element as element(), $start-node as node(), $end-node as node()) | ||
+ | as element() | ||
+ | { | ||
+ | let $start-ancestors := $element//*[. is $start-node]/ancestor::* | ||
+ | let $end-ancestors := $element//*[. is $end-node]/ancestor::* | ||
+ | let $shared := $start-ancestors intersect $end-ancestors | ||
+ | return | ||
+ | (: intersect returns document order, so the last item is the deepest :) | ||
+ | $shared[last()] | ||
+ | }; | ||
+ | |||
+ | (: Returns the part of $node that lies between $start-node and $end-node. | ||
+ |    - An element containing one of the two boundary nodes is rebuilt under the | ||
+ |      same name and its children are filtered recursively. | ||
+ |    - Any other node is copied unchanged when it lies strictly between the | ||
+ |      boundaries, and dropped otherwise. | ||
+ |    $include-start-and-end-nodes: whether the boundary elements are copied. | ||
+ |    $empty-ancestor-elements-to-include: local names of elements (such as | ||
+ |      handShift) whose nearest preceding occurrence is copied into every | ||
+ |      rebuilt element. | ||
+ |    Rebuilt elements carry only the xml:base, xml:space and xml:lang values in | ||
+ |    scope; their other attributes are not copied. :) | ||
+ | declare function local:get-fragment( | ||
+ | $node as node()*, | ||
+ | $start-node as element(), | ||
+ | $end-node as element(), | ||
+ | $include-start-and-end-nodes as xs:boolean, | ||
+ | $empty-ancestor-elements-to-include as xs:string+ | ||
+ | ) as node()* | ||
+ | { | ||
+ | typeswitch ($node) | ||
+ | case element() return | ||
+ | if ($node is $start-node or $node is $end-node) | ||
+ | then | ||
+ | if ($include-start-and-end-nodes) | ||
+ | then $node | ||
+ | else () | ||
+ | else | ||
+ | if (some $descendant in $node/descendant::* satisfies ($descendant is $start-node or $descendant is $end-node)) | ||
+ | then | ||
+ | element {node-name($node)} | ||
+ | { | ||
+ | (:the xml attributes that govern their descendants are carried over to the fragment; | ||
+ | if the fragment has several layers before it reaches text nodes, this information is duplicated, but this does no harm. | ||
+ | The nearest declaration wins: the parenthesised ancestor-or-self sequence is in document order, so [last()] picks | ||
+ | the element's own attribute or else that of its closest ancestor. A predicate written directly on the attribute step | ||
+ | would apply per ancestor and return every value.:) | ||
+ | ($node/ancestor-or-self::*/@xml:base)[last()], | ||
+ | ($node/ancestor-or-self::*/@xml:space)[last()], | ||
+ | ($node/ancestor-or-self::*/@xml:lang)[last()] | ||
+ | , | ||
+ | (:carry over the nearest of preceding empty elements that have significance for the fragment; though any element could be included here, the idea is to allow empty elements such as handShift to be carried over; | ||
+ | preceding:: is a reverse axis, so [1] is the closest preceding match:) | ||
+ | for $local-name in $empty-ancestor-elements-to-include | ||
+ | return | ||
+ | $node/preceding::*[local-name(.) = $local-name][1] | ||
+ | , | ||
+ | (:recurse:) | ||
+ | for $child in $node/node() | ||
+ | return local:get-fragment($child, $start-node, $end-node, $include-start-and-end-nodes, $empty-ancestor-elements-to-include) } | ||
+ | else | ||
+ | (:if an element follows the start-node and precedes the end-node, carry it over:) | ||
+ | if ($node >> $start-node and $node << $end-node) | ||
+ | then $node | ||
+ | else () | ||
+ | default return | ||
+ | (:if a text, comment or PI node follows the start-node and precedes the end-node, carry it over:) | ||
+ | if ($node >> $start-node and $node << $end-node) | ||
+ | then $node | ||
+ | else () | ||
+ | }; | ||
+ | |||
+ | (: Entry point: extracts the fragment between $start-node and $end-node. | ||
+ |    A document node is replaced by its element child and processed again. | ||
+ |    An element is handed to local:get-fragment, either as given or, when | ||
+ |    $wrap-in-first-common-ancestor-only is true, narrowed first to the result of | ||
+ |    local:get-common-ancestor. Any other input yields the empty sequence. | ||
+ |    The remaining parameters are passed through to local:get-fragment. :) | ||
+ | declare function local:get-fragment-from-doc( | ||
+ | $node as node()*, | ||
+ | $start-node as element(), | ||
+ | $end-node as element(), | ||
+ | $wrap-in-first-common-ancestor-only as xs:boolean, | ||
+ | $include-start-and-end-nodes as xs:boolean, | ||
+ | $empty-ancestor-elements-to-include as xs:string+ | ||
+ | ) as node()* | ||
+ | { | ||
+ | typeswitch ($node) | ||
+ | case document-node() return | ||
+ | local:get-fragment-from-doc($node/element(), $start-node, $end-node, $wrap-in-first-common-ancestor-only, $include-start-and-end-nodes, $empty-ancestor-elements-to-include) | ||
+ | case element() return | ||
+ | local:get-fragment( | ||
+ | if ($wrap-in-first-common-ancestor-only) | ||
+ | then local:get-common-ancestor($node, $start-node, $end-node) | ||
+ | else $node, | ||
+ | $start-node, $end-node, $include-start-and-end-nodes, $empty-ancestor-elements-to-include) | ||
+ | default return () | ||
+ | }; | ||
+ | |||
+ | (: Example run: the fragment between the page breaks numbered 7 and 8, wrapped in | ||
+ |    their common ancestor, boundaries included, carrying over handShift. :) | ||
+ | let $source := doc('/db/eebo/A00283.xml') | ||
+ | let $first-break := $source//tei:pb[@n="7"] | ||
+ | let $second-break := $source//tei:pb[@n="8"] | ||
+ | return | ||
+ | local:get-fragment-from-doc($source, $first-break, $second-break, true(), true(), ('handShift')) | ||
+ | </nowiki></pre> | ||
+ | |||
+ | |||
+ | [[Category:XQuery]] |
Latest revision as of 11:52, 24 June 2015
Contents
Authorship
Author | David Sewell, University of Virginia, dsewell@virginia.edu |
Last revised | 2007-05-02 |
Previous version | none |
Summary
This is an XQuery 1.0 function that will return all of the content between two milestone elements such as pb while preserving the hierarchical structure of the containing elements. For example, given content like this in a TEI document:
<TEI> <text> <body> <div1 type="chapter" n="1"> <!-- lots of div2s --> <div2 xml:id="doc100"> <p>An example<pb n="3"/>of a <i>very</i> short page<pb n="4"/>here.</p> </div2> <!-- followed by lots of other stuff --> </div1> </body> </text> </TEI>
the function would produce the following XML fragment as output when asked to return content between pb/@n=3 and pb/@n=4 with the text element as the ancestor:
<text> <body> <div1 type="chapter" n="1"> <div2 xml:id="doc100"> <p><pb n="3"/>of a <i>very</i> short page</p> </div2> </div1> </body> </text>
In other words, the full hierarchical structure of the ancestor elements, including their attributes, is preserved, but only the nodal content between the milestones is included.
Required Input
The function signature is
local:milestone-chunk( $ms1 as element(), $ms2 as element(), $node as node()* )
$node is an element known to be a common ancestor of the two milestones. For example, it could be TEI/text, or TEI/text/body, or even TEI/text/body/div1[3] if the milestone parameters are both descendants of that div1.
$ms1 is the first milestone element; $ms2 is the second milestone element.
For example, the output in "Summary" above might have been produced by the call
let $input := doc("mydoc.xml")/tei:TEI/tei:text return local:milestone-chunk($input//pb[@n="3"], $input//pb[@n="4"], $input)
If $input had been
doc("mydoc.xml")/tei:TEI/tei:text/tei:body
then the output would have started at the body element, etc.
$ms1 and $ms2 do not need to be adjacent milestones. You can, for example, return content between pb/@n=4 and pb/@n=7. Nor do the milestones need to be of the same type; you can return content between the pb for page 4 and an arbitrary anchor or pointer element later in the document.
Expected Output
As indicated in the example in "Summary" above, the output should be a single XML element reflecting the structure of the input ancestor element and its descendants, but otherwise containing only the nodal content between the two milestone elements in the original input.
Known Restrictions or Problems
The output will contain a copy of the first milestone element, but not of the second (closing) one. This is a consequence of the need to use pseudo-milestones for the second milestone in some cases; see next paragraph.
When using this function to generate content between TEI pb elements, an obvious problem is that the final page will not have a final pb milestone. In this case, use a pseudo-milestone such as the last node in the input element. For example, using the sample document in "Summary" above, the content of page 4 ("here.") would be output via the call
let $input := doc("mydoc.xml")/tei:TEI/tei:text return local:milestone-chunk($input//pb[@n="4"], ($input//node())[last()], $input)
If you use this function to recurse over all the pb elements in a document, you will need to use a strategy like this when $ms1 has no following pb element.
The function will not gracefully handle invalid input, but will probably throw run-time errors. You should be sure that the parameters passed to it reflect actual milestone elements with a common ancestor.
Code
(: Copies the content of $node that lies between two milestone elements while
   keeping the hierarchy of the containing elements.
   $ms1  : opening milestone; it is copied into the result.
   $ms2  : closing milestone; it is left out of the result.
   $node : node being examined; the initial call passes an element that is a
           common ancestor of both milestones.
   Returns the nodes to keep for $node, or the empty sequence. :)
declare function local:milestone-chunk(
  $ms1 as element(),
  $ms2 as element(),
  $node as node()*
) as node()*
{
  typeswitch ($node)
    case element() return
      if ($node is $ms1) then $node
      else if ( some $n in $node/descendant::* satisfies ($n is $ms1 or $n is $ms2) )
      then
        (: node-name() keeps prefix and namespace URI; rebuilding from the
           string name($node) drops a default namespace and fails on a prefix
           that the query does not declare :)
        element { node-name($node) }
          { for $i in ( $node/node() | $node/@* )
            return local:milestone-chunk($ms1, $ms2, $i) }
      else if ( $node >> $ms1 and $node << $ms2 ) then $node
      else ()
    case attribute() return $node (: will never match attributes outside non-returned elements :)
    default return
      if ( $node >> $ms1 and $node << $ms2 ) then $node
      else ()
};
Implementation in eXist
A string-based function to return fragments between two nodes is implemented in eXist (as a function called util:get-fragment-between()) by Josef Willenborg, Max Planck Institute for the History of Science jwillenborg@mpiwg-berlin.mpg.de:
util:get-fragment-between($beginning-node as node()?, $ending-node as node()?, $make-fragment as xs:boolean?) xs:string
Returns an xml fragment or a sequence of nodes between two elements (normally milestone elements). The $beginning-node represents the first node/milestone element, $ending-node, the second one. The third argument, $make-fragment, is a boolean value for the path completion. If it is set to true() the result sequence is wrapped into a parent element node.
Example call of the function for getting the fragment between two TEI page break element nodes:
let $fragment := util:get-fragment-between(//pb[1], //pb[2], true())
Extended version which keeps the namespace declaration
A short explanation of the problem [1]
(: Copies the content of $node that lies between two milestone elements.
   Elements that contain a milestone are rebuilt under their own QName
   (prefix and namespace URI included) and filtered recursively.
   $ms1  : opening milestone; it is copied into the result.
   $ms2  : closing milestone; it is left out of the result.
   $node : node being examined; the initial call passes an element that is a
           common ancestor of both milestones. :)
declare function local:milestone-chunk-ns(
  $ms1 as element(),
  $ms2 as element(),
  $node as node()*
) as node()*
{
  typeswitch ($node)
    (: attributes are only reached through an element that is being rebuilt :)
    case attribute() return $node
    case element() return
      if ($node is $ms1) then $node
      else if ( exists($node/descendant::*[. is $ms1 or . is $ms2]) )
      then
        (: node-name() yields the same QName as QName(namespace-uri(), name()) :)
        element { node-name($node) }
          { for $child in ( $node/@* | $node/node() )
            return local:milestone-chunk-ns($ms1, $ms2, $child) }
      else $node[. >> $ms1 and . << $ms2]
    default return
      (: text, comment and PI nodes are kept only when strictly between the milestones :)
      if ( $node >> $ms1 and $node << $ms2 ) then $node
      else ()
};
Extended version which maintains xml attributes in scope and listed preceding empty elements
xquery version "1.0";

declare namespace tei="http://www.tei-c.org/ns/1.0";

(: Returns the deepest element that is an ancestor of both $start-node and
   $end-node. Each node is looked up among the descendant elements of $element
   first, so a node that is not found there contributes no ancestors. :)
declare function local:get-common-ancestor($element as element(), $start-node as node(), $end-node as node())
as element()
{
  let $start-ancestors := $element//*[. is $start-node]/ancestor::*
  let $end-ancestors := $element//*[. is $end-node]/ancestor::*
  return
    (: intersect returns document order, so the last item is the deepest :)
    ($start-ancestors intersect $end-ancestors)[last()]
};

(: Returns the part of $node that lies between $start-node and $end-node.
   - An element containing one of the two boundary nodes is rebuilt under the
     same name and its children are filtered recursively.
   - Any other node is copied unchanged when it lies strictly between the
     boundaries, and dropped otherwise.
   $include-start-and-end-nodes: whether the boundary elements are copied.
   $empty-ancestor-elements-to-include: local names of elements (such as
     handShift) whose nearest preceding occurrence is copied into every
     rebuilt element.
   Rebuilt elements carry only the xml:base, xml:space and xml:lang values in
   scope; their other attributes are not copied. :)
declare function local:get-fragment(
  $node as node()*,
  $start-node as element(),
  $end-node as element(),
  $include-start-and-end-nodes as xs:boolean,
  $empty-ancestor-elements-to-include as xs:string+
) as node()*
{
  typeswitch ($node)
    case element() return
      if ($node is $start-node or $node is $end-node)
      then
        if ($include-start-and-end-nodes)
        then $node
        else ()
      else
        if (some $descendant in $node/descendant::* satisfies ($descendant is $start-node or $descendant is $end-node))
        then
          element {node-name($node)}
          {
            (:the xml attributes that govern their descendants are carried over to the fragment;
              if the fragment has several layers before it reaches text nodes, this information is duplicated, but this does no harm.
              The nearest declaration wins: the parenthesised ancestor-or-self sequence is in document order, so [last()]
              picks the element's own attribute or else that of its closest ancestor. A predicate written directly on the
              attribute step would apply per ancestor and return every value.:)
            ($node/ancestor-or-self::*/@xml:base)[last()],
            ($node/ancestor-or-self::*/@xml:space)[last()],
            ($node/ancestor-or-self::*/@xml:lang)[last()]
            ,
            (:carry over the nearest of preceding empty elements that have significance for the fragment; though any element
              could be included here, the idea is to allow empty elements such as handShift to be carried over;
              preceding:: is a reverse axis, so [1] is the closest preceding match:)
            for $local-name in $empty-ancestor-elements-to-include
            return
              $node/preceding::*[local-name(.) = $local-name][1]
            ,
            (:recurse:)
            for $child in $node/node()
            return local:get-fragment($child, $start-node, $end-node, $include-start-and-end-nodes, $empty-ancestor-elements-to-include)
          }
        else
          (:if an element follows the start-node and precedes the end-node, carry it over:)
          if ($node >> $start-node and $node << $end-node)
          then $node
          else ()
    default return
      (:if a text, comment or PI node follows the start-node and precedes the end-node, carry it over:)
      if ($node >> $start-node and $node << $end-node)
      then $node
      else ()
};

(: Entry point: a document node is replaced by its element child and processed
   again; an element is handed to local:get-fragment, narrowed first to the
   common ancestor of the boundaries when $wrap-in-first-common-ancestor-only
   is true; any other input yields the empty sequence. :)
declare function local:get-fragment-from-doc(
  $node as node()*,
  $start-node as element(),
  $end-node as element(),
  $wrap-in-first-common-ancestor-only as xs:boolean,
  $include-start-and-end-nodes as xs:boolean,
  $empty-ancestor-elements-to-include as xs:string+
) as node()*
{
  if ($node instance of element())
  then
    let $node :=
      if ($wrap-in-first-common-ancestor-only)
      then local:get-common-ancestor($node, $start-node, $end-node)
      else $node
    return
      local:get-fragment($node, $start-node, $end-node, $include-start-and-end-nodes, $empty-ancestor-elements-to-include)
  else
    if ($node instance of document-node())
    then local:get-fragment-from-doc($node/element(), $start-node, $end-node, $wrap-in-first-common-ancestor-only, $include-start-and-end-nodes, $empty-ancestor-elements-to-include)
    else ()
};

(: Example run: the fragment between the page breaks numbered 7 and 8. :)
let $input := doc('/db/eebo/A00283.xml')

return
  local:get-fragment-from-doc($input, $input//tei:pb[@n="7"], $input//tei:pb[@n="8"], true(), true(), ('handShift'))