GraphUtil.java

/*
 * Copyright (c) 2007-2017 MetaSolutions AB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.entrystore.rest.util;

import org.eclipse.rdf4j.common.xml.XMLReaderFactory;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.impl.LinkedHashModel;
import org.eclipse.rdf4j.rio.ParserConfig;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFHandler;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RDFParseException;
import org.eclipse.rdf4j.rio.RDFParser;
import org.eclipse.rdf4j.rio.RDFWriter;
import org.eclipse.rdf4j.rio.helpers.BasicWriterSettings;
import org.eclipse.rdf4j.rio.helpers.JSONLDMode;
import org.eclipse.rdf4j.rio.helpers.JSONLDSettings;
import org.eclipse.rdf4j.rio.helpers.StatementCollector;
import org.eclipse.rdf4j.rio.helpers.XMLParserSettings;
import org.eclipse.rdf4j.rio.jsonld.JSONLDParser;
import org.eclipse.rdf4j.rio.jsonld.JSONLDWriter;
import org.eclipse.rdf4j.rio.n3.N3ParserFactory;
import org.eclipse.rdf4j.rio.n3.N3Writer;
import org.eclipse.rdf4j.rio.ntriples.NTriplesParser;
import org.eclipse.rdf4j.rio.ntriples.NTriplesWriter;
import org.eclipse.rdf4j.rio.rdfxml.RDFXMLParser;
import org.eclipse.rdf4j.rio.rdfxml.util.RDFXMLPrettyWriter;
import org.eclipse.rdf4j.rio.trig.TriGParser;
import org.eclipse.rdf4j.rio.trig.TriGWriter;
import org.eclipse.rdf4j.rio.trix.TriXParser;
import org.eclipse.rdf4j.rio.trix.TriXWriter;
import org.eclipse.rdf4j.rio.turtle.TurtleParser;
import org.eclipse.rdf4j.rio.turtle.TurtleWriter;
import org.entrystore.repository.util.NS;
import org.json.JSONObject;
import org.restlet.data.MediaType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Utility methods to serialize and deserialize graphs.
 *
 * @author Hannes Ebner
 */
public class GraphUtil {

	/** Logger shared by the static helpers of this class. */
	private static final Logger log = LoggerFactory.getLogger(GraphUtil.class);

	/** Media types that {@link #isSupported(MediaType)} reports as supported. */
	private static final List<MediaType> supportedMediaTypes = new ArrayList<>();

	static {
		// Media types for which Restlet already provides constants
		supportedMediaTypes.add(MediaType.APPLICATION_RDF_XML);
		supportedMediaTypes.add(MediaType.APPLICATION_JSON);
		supportedMediaTypes.add(MediaType.TEXT_RDF_N3);

		// Media types built from the default MIME type of the respective RDF4J format
		RDFFormat[] rdf4jFormats = {
				RDFFormat.TURTLE,
				RDFFormat.TRIX,
				RDFFormat.NTRIPLES,
				RDFFormat.TRIG,
				RDFFormat.JSONLD
		};
		for (RDFFormat format : rdf4jFormats) {
			supportedMediaTypes.add(new MediaType(format.getDefaultMIMEType()));
		}

		// RDF/JSON under its own MIME type
		supportedMediaTypes.add(new MediaType("application/rdf+json"));
	}

	/**
	 * Serializes a graph with a writer that is instantiated reflectively from the given class.
	 * <p>
	 * The writer class must provide a public constructor that takes a {@link Writer}. The
	 * namespaces returned by {@link NS#getMap()} are declared in the output; for a
	 * {@link JSONLDWriter} only the namespaces that are actually used in the graph are declared.
	 *
	 * @param graph  The Graph to be serialized.
	 * @param writer One of the following: N3Writer, NTriplesWriter,
	 *               RDFXMLPrettyWriter, RDFXMLWriter, TriGWriter, TriXWriter,
	 *               TurtleWriter, JSONLDWriter
	 * @return A String representation of the serialized Graph, or null if the writer could not be
	 *         instantiated. If writing fails half-way, the error is logged and the output
	 *         produced up to that point is returned.
	 * @throws IllegalArgumentException if one of the parameters is null.
	 */
	public static String serializeGraph(Model graph, Class<? extends RDFWriter> writer) {
		if (graph == null || writer == null) {
			throw new IllegalArgumentException("Parameters must not be null");
		}

		StringWriter stringWriter = new StringWriter();
		Map<String, String> namespaces = NS.getMap();
		RDFWriter rdfWriter = null;
		try {
			Constructor<? extends RDFWriter> constructor = writer.getConstructor(Writer.class);
			rdfWriter = constructor.newInstance(stringWriter);

			// A setting is only forced if it has not been configured through a system property
			if (!System.getProperties().containsKey("org.eclipse.rdf4j.rio.rdf10_plain_literals")) {
				rdfWriter.getWriterConfig().set(BasicWriterSettings.XSD_STRING_TO_PLAIN_LITERAL, true);
			}
			if (!System.getProperties().containsKey("org.eclipse.rdf4j.rio.rdf10_language_literals")) {
				rdfWriter.getWriterConfig().set(BasicWriterSettings.RDF_LANGSTRING_TO_LANG_LITERAL, true);
			}
			if (!System.getProperties().containsKey("org.eclipse.rdf4j.rio.jsonld.optimize")) {
				rdfWriter.getWriterConfig().set(JSONLDSettings.OPTIMIZE, true);
			}
			if (!System.getProperties().containsKey("org.eclipse.rdf4j.rio.jsonld.use_native_types")) {
				rdfWriter.getWriterConfig().set(JSONLDSettings.USE_NATIVE_TYPES, true);
			}
			rdfWriter.getWriterConfig().set(JSONLDSettings.JSONLD_MODE, JSONLDMode.COMPACT);

			if (rdfWriter instanceof JSONLDWriter) {
				// we optimize to include only the used namespaces as contexts in JSON-LD
				namespaces = new HashMap<>();
				for (Statement s : graph) {
					namespaces.putAll(findNS(s.getSubject()));
					namespaces.putAll(findNS(s.getPredicate()));
					namespaces.putAll(findNS(s.getObject()));
				}
			}
		} catch (Exception e) {
			// The throwable is passed along because reflection failures frequently carry no
			// message of their own; logging only the message would hide the actual cause
			log.error(e.getMessage(), e);
		}

		if (rdfWriter == null) {
			return null;
		}

		try {
			rdfWriter.startRDF();
			for (Map.Entry<String, String> namespace : namespaces.entrySet()) {
				rdfWriter.handleNamespace(namespace.getKey(), namespace.getValue());
			}
			for (Statement statement : graph) {
				rdfWriter.handleStatement(statement);
			}
			rdfWriter.endRDF();
		} catch (RDFHandlerException rdfe) {
			log.error(rdfe.getMessage(), rdfe);
		}
		return stringWriter.toString();
	}

	/**
	 * Writes a graph to an existing writer instance. All namespaces returned by
	 * {@link NS#getMap()} are declared before the statements are written. A failure reported by
	 * the writer is logged and not propagated.
	 *
	 * @param graph     The Graph to be serialized.
	 * @param rdfWriter The writer that receives the namespaces and statements.
	 * @throws IllegalArgumentException if one of the parameters is null.
	 */
	public static void serializeGraph(Model graph, RDFWriter rdfWriter) {
		boolean missingArgument = (graph == null) || (rdfWriter == null);
		if (missingArgument) {
			throw new IllegalArgumentException("Parameters must not be null");
		}

		try {
			rdfWriter.startRDF();

			// Namespace declarations go first, followed by the actual content
			for (Map.Entry<String, String> prefixMapping : NS.getMap().entrySet()) {
				rdfWriter.handleNamespace(prefixMapping.getKey(), prefixMapping.getValue());
			}
			for (Statement triple : graph) {
				rdfWriter.handleStatement(triple);
			}

			rdfWriter.endRDF();
		} catch (RDFHandlerException writeFailure) {
			log.error(writeFailure.getMessage());
		}
	}

	/**
	 * Parses a serialized graph with the given parser. Parser, handler and I/O errors are
	 * logged instead of being propagated to the caller.
	 *
	 * @param serializedGraph The Graph to be deserialized.
	 * @param parser          Instance of the following: N3Parser, NTriplesParser,
	 *                        RDFXMLParser, TriGParser, TriXParser, TurtleParser
	 * @return The deserialized Graph, or null if the payload could not be parsed.
	 * @throws IllegalArgumentException if one of the parameters is null.
	 */
	public static Model deserializeGraph(String serializedGraph, RDFParser parser) {
		Model parsedGraph = null;
		try {
			parsedGraph = deserializeGraphUnsafe(serializedGraph, parser);
		} catch (RDFHandlerException | RDFParseException | IOException parseFailure) {
			log.error(parseFailure.getMessage());
		}
		return parsedGraph;
	}

	/**
	 * Parses a serialized graph with the given parser and lets all parsing related exceptions
	 * pass through to the caller. An empty string is used as base URI.
	 *
	 * @param serializedGraph The Graph to be deserialized.
	 * @param parser          The parser to be used; its RDF handler is replaced by this method.
	 * @return A new Model that contains the parsed statements.
	 * @throws IllegalArgumentException if one of the parameters is null.
	 * @throws RDFParseException        if raised by the parser.
	 * @throws RDFHandlerException      if raised by the parser.
	 * @throws IOException              if raised by the parser.
	 */
	public static Model deserializeGraphUnsafe(String serializedGraph, RDFParser parser) throws RDFParseException, RDFHandlerException, IOException {
		boolean missingArgument = (serializedGraph == null) || (parser == null);
		if (missingArgument) {
			throw new IllegalArgumentException("Parameters must not be null");
		}

		// The parser pushes every statement it encounters into the collector
		StatementCollector parsedStatements = new StatementCollector();
		parser.setRDFHandler(parsedStatements);
		parser.parse(new StringReader(serializedGraph), "");

		return new LinkedHashModel(parsedStatements.getStatements());
	}

	/**
	 * Parses a serialized graph according to its media type. Parser, handler and I/O errors
	 * are logged instead of being propagated to the caller.
	 *
	 * @param graphString The Graph to be deserialized.
	 * @param mediaType   The media type of the serialization.
	 * @return The result of {@link #deserializeGraphUnsafe(String, MediaType)}, or null if a
	 *         parsing related exception was raised.
	 */
	public static Model deserializeGraph(String graphString, MediaType mediaType) {
		try {
			return deserializeGraphUnsafe(graphString, mediaType);
		} catch (RDFHandlerException | RDFParseException | IOException parseFailure) {
			log.error(parseFailure.getMessage());
			return null;
		}
	}

	/**
	 * Parses a serialized graph according to its media type and lets all parsing related
	 * exceptions pass through to the caller. The XML based formats (RDF/XML and TriX) are parsed
	 * with the configuration built by {@code constructSafeXmlParserConfig()}.
	 *
	 * @param graphString The Graph to be deserialized.
	 * @param mediaType   The media type of the serialization, must not be null.
	 * @return The deserialized Graph, or null if none of the known media types matches.
	 * @throws RDFHandlerException if raised by the parser.
	 * @throws IOException         if raised by the parser.
	 * @throws RDFParseException   if raised by the parser.
	 */
	public static Model deserializeGraphUnsafe(String graphString, MediaType mediaType) throws RDFHandlerException, IOException, RDFParseException {
		String typeName = mediaType.getName();

		// RDF/JSON is converted by RDFJSON and does not go through an RDFParser
		if (mediaType.equals(MediaType.APPLICATION_JSON) || typeName.equals("application/rdf+json")) {
			return RDFJSON.rdfJsonToGraph(graphString);
		}

		RDFParser selectedParser;
		if (mediaType.equals(MediaType.APPLICATION_RDF_XML)) {
			selectedParser = new RDFXMLParser();
			selectedParser.setParserConfig(constructSafeXmlParserConfig());
		} else if (mediaType.equals(MediaType.TEXT_RDF_N3)) {
			selectedParser = new N3ParserFactory().getParser();
		} else if (typeName.equals(RDFFormat.TURTLE.getDefaultMIMEType())) {
			selectedParser = new TurtleParser();
		} else if (typeName.equals(RDFFormat.TRIX.getDefaultMIMEType())) {
			selectedParser = new TriXParser();
			selectedParser.setParserConfig(constructSafeXmlParserConfig());
		} else if (typeName.equals(RDFFormat.NTRIPLES.getDefaultMIMEType())) {
			selectedParser = new NTriplesParser();
		} else if (typeName.equals(RDFFormat.TRIG.getDefaultMIMEType())) {
			selectedParser = new TriGParser();
		} else if (typeName.equals(RDFFormat.JSONLD.getDefaultMIMEType())) {
			selectedParser = new JSONLDParser();
		} else {
			// No parser is known for this media type
			return null;
		}

		return deserializeGraphUnsafe(graphString, selectedParser);
	}

	/**
	 * Serializes a graph into the format that corresponds to the given media type. Turtle is
	 * used if the media type does not match any of the known formats.
	 *
	 * @param graph     The Graph to be serialized.
	 * @param mediaType The media type of the requested serialization, must not be null.
	 * @return A String representation of the serialized Graph as returned by the RDF/JSON
	 *         converter or by {@link #serializeGraph(Model, Class)}.
	 */
	public static String serializeGraph(Model graph, MediaType mediaType) {
		String typeName = mediaType.getName();

		// RDF/JSON is produced by RDFJSON and does not go through an RDFWriter
		if (mediaType.equals(MediaType.APPLICATION_JSON) || typeName.equals("application/rdf+json")) {
			return RDFJSON.graphToRdfJson(graph);
		}

		Class<? extends RDFWriter> writerClass;
		if (mediaType.equals(MediaType.APPLICATION_RDF_XML)) {
			writerClass = RDFXMLPrettyWriter.class;
		} else if (mediaType.equals(MediaType.TEXT_RDF_N3)) {
			writerClass = N3Writer.class;
		} else if (typeName.equals(RDFFormat.TURTLE.getDefaultMIMEType())) {
			writerClass = TurtleWriter.class;
		} else if (typeName.equals(RDFFormat.TRIX.getDefaultMIMEType())) {
			writerClass = TriXWriter.class;
		} else if (typeName.equals(RDFFormat.NTRIPLES.getDefaultMIMEType())) {
			writerClass = NTriplesWriter.class;
		} else if (typeName.equals(RDFFormat.TRIG.getDefaultMIMEType())) {
			writerClass = TriGWriter.class;
		} else if (typeName.equals(RDFFormat.JSONLD.getDefaultMIMEType())) {
			writerClass = JSONLDWriter.class;
		} else {
			// fallback
			writerClass = TurtleWriter.class;
		}

		return serializeGraph(graph, writerClass);
	}

	/**
	 * Serializes a graph into a JSON object, either as RDF/JSON or as JSON-LD.
	 *
	 * @param graph     The Graph to be serialized.
	 * @param rdfFormat The requested format. RDF/JSON is produced for null,
	 *                  {@code application/json} and {@code application/rdf+json}; JSON-LD is
	 *                  produced for the default MIME type of {@link RDFFormat#JSONLD}.
	 * @return The serialized Graph, or an empty JSON object if the format is not JSON based or
	 *         the JSON-LD serialization did not yield any output.
	 */
	public static JSONObject serializeGraphToJson(Model graph, MediaType rdfFormat) {
		// "application/rdf+json" is treated as RDF/JSON here as well, in line with how the
		// String based serialization and deserialization methods of this class handle it
		if (rdfFormat == null
				|| MediaType.APPLICATION_JSON.equals(rdfFormat)
				|| "application/rdf+json".equals(rdfFormat.getName())) {
			// We don't use GraphUtil.serializeGraph() because we need a JSONObject here and
			// converting back and forth between String and JSONObject would not be very efficient
			return RDFJSON.graphToRdfJsonObject(graph);
		}

		if (RDFFormat.JSONLD.getDefaultMIMEType().equals(rdfFormat.getName())) {
			String jsonLd = GraphUtil.serializeGraph(graph, rdfFormat);
			// serializeGraph() yields null if the writer could not be created; a null String
			// must not reach the JSONObject constructor
			if (jsonLd != null) {
				return new JSONObject(jsonLd);
			}
		}

		log.warn("Model could not be serialized, returning empty JSON object");
		return new JSONObject();
	}

	/**
	 * Checks whether a media type is among the media types registered in this class.
	 *
	 * @param mediaType The media type to look up.
	 * @return True if one of the registered media types equals the given media type.
	 */
	public static boolean isSupported(MediaType mediaType) {
		return supportedMediaTypes.stream()
				.anyMatch(candidate -> candidate.equals(mediaType, false));
	}

	/**
	 * Detects whether an RDF payload can be parsed by RDF4J (or, in case of RDF/JSON, by the
	 * RDF/JSON converter).
	 *
	 * @param rdf       The RDF to validate.
	 * @param mediaType The media type of the RDF.
	 * @return Returns null if successful or an error message if there was an error when parsing the payload,
	 *         if the media type is not supported, or if no payload was provided.
	 */
	public static String validateRdf(String rdf, MediaType mediaType) {
		if (!isSupported(mediaType)) {
			return "Unsupported media type: " + mediaType;
		}
		if (rdf == null) {
			// Without this guard the StringReader below would fail with a NullPointerException
			return "No RDF payload provided";
		}

		if (mediaType.equals(MediaType.APPLICATION_JSON) || mediaType.getName().equals("application/rdf+json")) {
			// we have special treatment of RDF/JSON here because it does not implement the Parser interface
			Model g = RDFJSON.rdfJsonToGraph(rdf);
			// A graph means the payload could be converted; a missing graph is treated as a
			// parsing failure (NOTE(review): assumes rdfJsonToGraph signals errors with null)
			if (g == null) {
				return "There was an error parsing the RDF/JSON payload";
			}
			return null;
		}

		RDFParser parser;
		if (mediaType.equals(MediaType.TEXT_RDF_N3)) {
			parser = new N3ParserFactory().getParser();
		} else if (mediaType.getName().equals(RDFFormat.TURTLE.getDefaultMIMEType())) {
			parser = new TurtleParser();
		} else if (mediaType.getName().equals(RDFFormat.TRIX.getDefaultMIMEType())) {
			parser = new TriXParser();
			parser.setParserConfig(constructSafeXmlParserConfig());
		} else if (mediaType.getName().equals(RDFFormat.NTRIPLES.getDefaultMIMEType())) {
			parser = new NTriplesParser();
		} else if (mediaType.getName().equals(RDFFormat.TRIG.getDefaultMIMEType())) {
			parser = new TriGParser();
		} else if (mediaType.getName().equals(RDFFormat.JSONLD.getDefaultMIMEType())) {
			parser = new JSONLDParser();
		} else {
			// RDF/XML is the default for everything that was not matched above
			parser = new RDFXMLParser();
			parser.setParserConfig(constructSafeXmlParserConfig());
		}

		// The handler only validates, the parsed statements are not kept
		RDFHandler nullHandler = new URIValidatingRDFHandler();
		String error = null;
		try {
			parser.setRDFHandler(nullHandler);
			parser.parse(new StringReader(rdf), "");
		} catch (RDFHandlerException | RDFParseException | IOException rdfe) {
			error = rdfe.getMessage();
		}

		return error;
	}

	/**
	 * Creates a hardened XML parser configuration that protects against XXE attacks. A custom
	 * XML reader is created so that features can be set which are not covered by the parser
	 * settings alone. If the reader cannot be created, the configuration is returned with the
	 * parser settings only.
	 *
	 * @return Returns a custom XML parser configuration including a custom XML reader.
	 */
	private static ParserConfig constructSafeXmlParserConfig() {
		ParserConfig safeConfig = new ParserConfig();
		safeConfig.set(XMLParserSettings.LOAD_EXTERNAL_DTD, false);
		safeConfig.set(XMLParserSettings.SECURE_PROCESSING, true);

		XMLReader hardenedReader;
		try {
			hardenedReader = XMLReaderFactory.createXMLReader();
		} catch (SAXException e) {
			log.error(e.getMessage());
			return safeConfig;
		}
		if (hardenedReader == null) {
			return safeConfig;
		}

		safeConfig.set(XMLParserSettings.CUSTOM_XML_READER, hardenedReader);

		// Disallow DOCTYPE declaration
		setXmlReaderFeature(hardenedReader, "http://apache.org/xml/features/disallow-doctype-decl", true);
		// External text entities
		setXmlReaderFeature(hardenedReader, "http://xml.org/sax/features/external-general-entities", false);
		// External parameter entities
		setXmlReaderFeature(hardenedReader, "http://xml.org/sax/features/external-parameter-entities", false);
		// Disable external DTDs
		setXmlReaderFeature(hardenedReader, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

		return safeConfig;
	}

	/**
	 * Sets a single feature on an XML reader; a feature that is rejected by the reader is
	 * logged as a warning and otherwise ignored.
	 *
	 * @param reader  The XML reader to configure.
	 * @param feature The URI of the feature.
	 * @param enabled The value the feature is set to.
	 */
	private static void setXmlReaderFeature(XMLReader reader, String feature, boolean enabled) {
		try {
			reader.setFeature(feature, enabled);
		} catch (SAXException se) {
			log.warn(se.getMessage());
		}
	}

	/**
	 * Looks up the namespaces of {@link NS#getMap()} that a value makes use of. For an IRI its
	 * string value is inspected, for a literal the IRI of its datatype; all other values yield
	 * an empty result.
	 *
	 * @param value The value to inspect.
	 * @return A map from prefix to namespace with all namespaces that the inspected IRI starts with.
	 */
	private static Map<String, String> findNS(Value value) {
		Map<String, String> matches = new HashMap<>();

		boolean literal = value.isLiteral();
		if (!literal && !value.isIRI()) {
			return matches;
		}

		// For a literal the namespace of interest is the one of its datatype, not of its label
		String iri = literal
				? ((Literal) value).getDatatype().stringValue()
				: value.stringValue();

		for (Map.Entry<String, String> mapping : NS.getMap().entrySet()) {
			if (iri.startsWith(mapping.getValue())) {
				matches.put(mapping.getKey(), mapping.getValue());
			}
		}
		return matches;
	}
}