# HG changeset patch
# User František Kučera <franta-hg@frantovo.cz>
# Date 1318468162 -7200
# Node ID 41d6c0cac8b34dcb79b5f6158127b1876d94fb0d
# Parent  e1244384cc6f14a71de0e6f2a57a62ea79490118
Drupal: čištění HTML – Tidy.

diff -r e1244384cc6f -r 41d6c0cac8b3 src/org/sonews/storage/DrupalMessage.java
--- a/src/org/sonews/storage/DrupalMessage.java	Wed Oct 12 23:10:31 2011 +0200
+++ b/src/org/sonews/storage/DrupalMessage.java	Thu Oct 13 03:09:22 2011 +0200
@@ -17,8 +17,12 @@
  */
 package org.sonews.storage;
 
+import java.io.BufferedReader;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.PrintStream;
 import java.io.StringReader;
 import java.io.StringWriter;
 import java.io.UnsupportedEncodingException;
@@ -114,8 +118,22 @@
 		 */
 		try {
 			String originalText = rs.getString("text");
-			StringReader input = new StringReader("<body>" + originalText + "</body>");
-			StringWriter output = new StringWriter();
+
+			/**
+			 * TODO: používat cache, ukládat si vygenerované články
+			 * 
+			 * 
+			 * Místo markdownu jen ošetřit:
+			 *		- odstavce
+			 *		- nesmyslné entity v odkazech
+			 *		- neuzavřené značky: br, hr, img
+			 */
+			String tidyTexy = tidyXhtml("<html><body>" + originalText + "</body></html>");
+
+
+
+			StringReader input = new StringReader(tidyTexy);
+			StringWriter output = new StringWriter(2 * tidyTexy.length());
 			TransformerFactory tf = TransformerFactory.newInstance();
 			Transformer t = tf.newTransformer(new StreamSource(Resource.getAsStream("helpers/mimeXhtmlPart.xsl")));
 			t.setParameter("isRoot", (rs.getInt("parent_id") == 0));
@@ -124,7 +142,7 @@
 			t.setParameter("wwwRead", rs.getString("wwwRead"));
 			t.setParameter("wwwPost", rs.getString("wwwPost"));
 			t.transform(new StreamSource(input), new StreamResult(output));
-			
+
 			return output.toString();
 		} catch (Exception e) {
 			/**
@@ -135,6 +153,47 @@
 		}
 	}
 
+	/**
+	 * Runs the external "tidy" program on the text and returns what it prints to stdout.
+	 * TODO: refactor, move elsewhere. NOTE(review): tidy's stderr is never read here.
+	 */
+	private static String tidyXhtml(String inputText) throws IOException {
+		Process p = Runtime.getRuntime().exec(new String[]{"tidy",
+					"-asxml", "-numeric", "-utf8", "-quiet",
+					"--doctype", "omit", "--logical-emphasis", "true",
+					"--show-errors", "0"});
+		try {
+			// tidy runs with -utf8, so encode explicitly instead of using the platform charset
+			PrintStream processInput = new PrintStream(p.getOutputStream(), false, "UTF-8");
+			processInput.print(inputText);
+			processInput.close(); // closing stdin signals end of input to tidy
+			if (processInput.checkError()) { // PrintStream hides write failures
+				throw new IOException("Writing to the tidy process failed.");
+			}
+			return streamToString(p.getInputStream());
+		} finally {
+			p.destroy(); // do not leave the child process behind
+		}
+	}
+
+	/**
+	 * Reads the whole stream as UTF-8 text (the encoding tidy is asked to emit) and closes it.
+	 * Each line of the result ends with "\n". TODO: refactor, move elsewhere.
+	 */
+	private static String streamToString(InputStream proud) throws IOException {
+		StringBuilder result = new StringBuilder();
+		// explicit charset: the platform default would garble non-ASCII text on non-UTF-8 systems
+		BufferedReader buf = new BufferedReader(new InputStreamReader(proud, "UTF-8"));
+		try {
+			for (String line = buf.readLine(); line != null; line = buf.readLine()) {
+				result.append(line).append("\n");
+			}
+		} finally {
+			buf.close();
+		}
+		return result.toString();
+	}
+
 	private static String constructMessageId(int articleID, int groupID, String groupName, String domainName) {
 		StringBuilder sb = new StringBuilder();
 		sb.append("<");