SlideGuitarist: October 2011

I was recently confronted with some XML structured in this way: a <person> at the top, followed by 0 or more entities to whom the person has a certain sort of relationship. These may include other persons, and the same persons may appear in several input documents. I’d like to split up all these documents, and apply additional XSLTs to them. Moreover, I can only decide on where the output XML is going to go based on information not available to the XSLT processor, i.e. I can’t simply calculate a URI in <xsl:result-document> and let the processor open the file for me. This is the first time I’ve used the Saxon processor’s setOutputURIResolver() method; in fact, I didn’t know it existed until I did some hunting in Eclipse.

Here’s what the input looks like, more or less:

<?xml version="1.0"?>
<person-with-relationships>
 <person id="1">
  <name>Anthony Albert Nassar</name>
  <phone-number>800-555-1212</phone-number>
 </person>
 <relationships>
  <relationship>
   <employment>
    <start-date/>
   </employment>
   <organization id="2">
    <organization-name>Palantir Technologies, Inc.</organization-name>
    <url>palantir.com</url>
   </organization>
  </relationship>
  <relationship>
   <marriage>
    <start-date/>
   </marriage>
   <person id="3">
    <name>Donavan Arizmendi</name>
   </person>
  </relationship>
 </relationships>
</person-with-relationships>

The XSLT looks like this:

<?xml version="1.0"?>
<!--
 Splitter stylesheet. Each matched element is written to its own result
 document whose href is the element's id attribute followed by ".xml".
 Two kinds of element are matched: the second child element of every
 <relationship>, and the <person> directly under the
 <person-with-relationships> root.
-->
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xpath-default-namespace="">
 <!-- Supplies the templates that xsl:apply-imports falls back on below;
      presumably identity.xslt holds the identity (copy) transform. -->
 <xsl:import href="identity.xslt"/>

 <!-- Second child element of a <relationship>, whatever its name. The element
      name and id are reported via xsl:message before the document is opened. -->
 <xsl:template match="relationship/*[2]">
  <xsl:message>Opening document for <xsl:value-of select="local-name()"/> with ID <xsl:value-of select="@id"/></xsl:message>
  <!-- The href is only a name here: the host application may intercept it
       (e.g. through an output URI resolver) rather than let a file be written. -->
  <xsl:result-document href="{@id}.xml">
   <!-- Hand the current node to the imported templates. -->
   <xsl:apply-imports/>
  </xsl:result-document>
 </xsl:template>

 <!-- The <person> that is a direct child of the document element. -->
  <xsl:template match="/person-with-relationships/person">
  <xsl:message>Opening document for person with ID <xsl:value-of select="@id"/></xsl:message>
  <xsl:result-document href="{@id}.xml">
   <!-- Invoke the imported identity template, i.e. just copy this subtree to the output. -->
   <!-- If you would rather invoke a lower-priority template defined in this
    same stylesheet, use <xsl:next-match/> instead of <xsl:apply-imports/>.
   -->
   <xsl:apply-imports/>
  </xsl:result-document>
 </xsl:template>


</xsl:stylesheet>

Let’s say that I want to turn each output into a DOM, or a dom4j Document, before I do anything else with it, i.e. I can’t just write the output to files. Moreover, I want to avoid overwriting files I’ve already created, and I may need to aggregate information from all the inputs in a way not suited to XSLT (I could do some of this in XQuery…but I digress). The Java for this purpose might look like the following. I’m using dom4j to set the output subtrees aside. I could have used DOM (quelle horreur), or, probably, something in Saxon, but I actually wanted to stay closer to JAXP. So:

package com.demo.xml;

import java.io.File;
import java.io.IOException;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

import javax.xml.transform.Result;
import javax.xml.transform.Templates;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import net.sf.saxon.Controller;
import net.sf.saxon.FeatureKeys;
import net.sf.saxon.OutputURIResolver;
import net.sf.saxon.TransformerFactoryImpl;
import net.sf.saxon.event.SequenceWriter;
import net.sf.saxon.om.Item;
import net.sf.saxon.trans.XPathException;

import org.apache.commons.io.FileUtils;
import org.dom4j.Document;
import org.dom4j.io.DocumentResult;
import org.dom4j.io.DocumentSource;

import com.google.common.io.NullOutputStream;

public class InputSplitter {
       private final Templates splitterTemplates;

       // http://dhruba.name/2009/08/05/concurrent-set-implementations-in-java-6/
       private final ConcurrentMap<String,DocumentResult> urisProcessed = new ConcurrentHashMap<String,DocumentResult>();

       private final TransformerFactoryImpl factory;

       public InputSplitter() throws TransformerException {
              factory = new TransformerFactoryImpl();
              // I also have the requirement of removing elements with only 
              // whitespace nodes among their descendants. This attribute
              // lets the parser throw away such whitespace nodes. The 
              // XPath expression to discard elements with no content 
              // then becomes trivial.
              factory.setAttribute(FeatureKeys.STRIP_WHITESPACE, "all");
              File splitterXlstFile = new File("resources/splitter.xsl");
              // Calling newTemplates(), rather than newTransformer(), gives me 
              // on thread-safe object that I can use repeatedly. Each time I 
              // want to transform an input, I have to create a new Transformer.
              this.splitterTemplates = factory.newTemplates(new StreamSource(splitterXlstFile));
       }

       public void splitFile(File xmlFile) throws TransformerException {
              final StreamSource xmlSource = new StreamSource(xmlFile);

              TransformerHandler handler = factory.newTransformerHandler(splitterTemplates);
              Transformer transformer = handler.getTransformer();
              Controller controller = (Controller) transformer;
              // You might not want an anonymous implementation of OutputURIResolver,
              // but that's irrelevant to the example. In any case, this is Saxon's
              // back door to <xsl:result-document>.
              controller.setOutputURIResolver(new OutputURIResolver() {
                     @Override
                     public void close(Result result) throws TransformerException {
                           // If you opened a Stream in resolve(), you'd want to close it
                           // here.

                     @Override
                     public Result resolve(String href, String base) throws TransformerException {
                           DocumentResult result = new DocumentResult();
                           DocumentResult existingResult = urisProcessed.putIfAbsent(href, result);

                           if (existingResult == null) {

                                  return result;
                           } else {
                                  // Throw the results away. There might be a way to implement 
                                  // a null SAXResult, but I'll leave that as an exercise for the 
                                  // reader.
                                  return new StreamResult(new NullOutputStream());
                           }
                          
                     }});

              
              controller.setMessageEmitter(new SequenceWriter() {

                     @Override
                     public void write(Item item) throws XPathException {
                           System.out.println(item.getStringValue())

                    }});
              // Discard the output from the entire document.
              transformer.transform(xmlSource, new StreamResult(new NullOutputStream()));
       }

       public void transformFolder(File folder) throws TransformerException {
              for (File xmlFile : folder.listFiles()) {
                     splitFile(xmlFile);
              }
       }

       static public void main(String[] args) throws TransformerException, IOException {
              InputSplitter splitter = new InputSplitter();
              
              assert args.length == 2;
              File inputXml = new File(args[0]);
              splitter.splitFile(inputXml);
              
              final File outputDirectory = new File(args[1]);

              if (!outputDirectory.mkdirs())
                     FileUtils.cleanDirectory(outputDirectory);

              for (String entry : splitter.urisProcessed.keySet()) {
                     File outputFile = new File(outputDirectory, entry);
                     // Use the identity transform to turn the dom4j tree into a file.
                     Transformer newTransformer = splitter.factory.newTransformer();
                     newTransformer.setOutputProperty("indent", "yes");
                     Document document = splitter.urisProcessed.get(entry).getDocument();
                     newTransformer.transform(new DocumentSource(document), new StreamResult(outputFile));
              }
        }
}

SlideGuitarist

Saturday, October 01, 2011

XSLT to Produce Multiple Results from a Single Input

XSLT Resources

Functional Programming

Blog Archive

Technical Resources

Tech Classics

Arts & Letters

About Me