# HG changeset patch
# User František Kučera <franta-hg@frantovo.cz>
# Date 1318797406 -7200
# Node ID 21f41354135721e7aa001fcaddb9fec14eae2908
# Parent  b51ab80c7a9d276cf40bac071ef0c59981c9d4f1
Drupal: tidy, odstavce, XSLT

diff -r b51ab80c7a9d -r 21f413541357 src/org/sonews/storage/DrupalMessage.java
--- a/src/org/sonews/storage/DrupalMessage.java	Sun Oct 16 20:55:46 2011 +0200
+++ b/src/org/sonews/storage/DrupalMessage.java	Sun Oct 16 22:36:46 2011 +0200
@@ -91,57 +91,67 @@
 			Multipart multipart = new MimeMultipart("alternative");
 			setContent(multipart);
 
+			/** XHTML part */
+			MimeBodyPart htmlPart = new MimeBodyPart();
+			multipart.addBodyPart(htmlPart);
+			String xhtmlText = readXhtmlText(rs);
+			htmlPart.setContent(xhtmlText, XHTML_CONTENT_TYPE);
+			
 			/** Plain text part */
 			MimeBodyPart textPart = new MimeBodyPart();
 			multipart.addBodyPart(textPart);
-			textPart.setText(readPlainText(rs));
-
-			/** XHTML part */
-			MimeBodyPart htmlPart = new MimeBodyPart();
-			multipart.addBodyPart(htmlPart);
-			htmlPart.setContent(readXhtmlText(rs), XHTML_CONTENT_TYPE);
+			textPart.setText(readPlainText(rs, xhtmlText));
 		} else {
+			/** empty body, just headers */
 			setText("");
 		}
 	}
 
-	private String readPlainText(ResultSet rs) {
+	private String readPlainText(ResultSet rs, String xhtmlText) {
 		/**
 		 * TODO: převést na prostý text
 		 */
-		return "TODO: obyčejný text";
+		return "TODO: obyčejný text\n\n\n" + xhtmlText;
 	}
 
 	private String readXhtmlText(ResultSet rs) {
 		/**
-		 * TODO: znovupoužívat XSL transformér
+		 * TODO: 
+		 *		- reuse the XSL transformer instead of creating it on every call
+		 *		- use a cache, store the generated articles
 		 */
 		try {
-			String originalText = rs.getString("text");
+			String inputText = "<html><body>" + rs.getString("text") + "</body></html>";
 
-			/**
-			 * TODO: používat cache, ukládat si vygenerované články
-			 * 
-			 * 
-			 * Místo markdownu jen ošetřit:
-			 *		- odstavce
-			 *		- nesmyslné entity v odkazech
-			 *		- neuzavřené značky: br, hr, img
-			 */
-			String tidyTexy = tidyXhtml("<html><body>" + originalText + "</body></html>");
+			TransformerFactory tf = TransformerFactory.newInstance();
+			Transformer paragraphTransformer = tf.newTransformer(new StreamSource(Resource.getAsStream("helpers/mimeXhtmlPart-make-paragraphs.xsl")));
 
+			String paragraphedText;
+			boolean tidyWasUsed = false;
+			try {
+				StringReader input = new StringReader(inputText);
+				StringWriter output = new StringWriter(2 * inputText.length());
+				paragraphTransformer.transform(new StreamSource(input), new StreamResult(output));
+				paragraphedText = output.toString();
+			} catch (Exception e) {
+				log.log(Level.FINER, "HTML input was shitty – Tidy had to be called.", e);
+				StringReader input = new StringReader(tidyXhtml(inputText));
+				StringWriter output = new StringWriter(2 * inputText.length());
+				paragraphTransformer.transform(new StreamSource(input), new StreamResult(output));
+				paragraphedText = output.toString();
+				tidyWasUsed = true;
+			}
 
-
-			StringReader input = new StringReader(tidyTexy);
-			StringWriter output = new StringWriter(2 * tidyTexy.length());
-			TransformerFactory tf = TransformerFactory.newInstance();
-			Transformer t = tf.newTransformer(new StreamSource(Resource.getAsStream("helpers/mimeXhtmlPart.xsl")));
-			t.setParameter("isRoot", (rs.getInt("parent_id") == 0));
-			t.setParameter("title", rs.getString("subject"));
-			t.setParameter("urlBase", rs.getString("urlBase"));
-			t.setParameter("wwwRead", rs.getString("wwwRead"));
-			t.setParameter("wwwPost", rs.getString("wwwPost"));
-			t.transform(new StreamSource(input), new StreamResult(output));
+			Transformer xhtmlTransformer = tf.newTransformer(new StreamSource(Resource.getAsStream("helpers/mimeXhtmlPart.xsl")));
+			xhtmlTransformer.setParameter("isRoot", (rs.getInt("parent_id") == 0));
+			xhtmlTransformer.setParameter("title", rs.getString("subject"));
+			xhtmlTransformer.setParameter("urlBase", rs.getString("urlBase"));
+			xhtmlTransformer.setParameter("wwwRead", rs.getString("wwwRead"));
+			xhtmlTransformer.setParameter("wwwPost", rs.getString("wwwPost"));
+			xhtmlTransformer.setParameter("headComment", String.format("Drupal-NNTP bridge. Transformed: %1$tc. Tidy had to be used: %2$b", new Date(), tidyWasUsed));
+			StringReader input = new StringReader(paragraphedText);
+			StringWriter output = new StringWriter(2 * paragraphedText.length());
+			xhtmlTransformer.transform(new StreamSource(input), new StreamResult(output));
 
 			return output.toString();
 		} catch (Exception e) {
@@ -157,15 +167,21 @@
 	 * TODO: refaktorovat, přesunout
 	 */
 	private static String tidyXhtml(String inputText) throws IOException {
+		// https://sourceforge.net/tracker/index.php?func=detail&aid=3424437&group_id=27659&atid=390966
+		inputText = inputText.replaceAll("\\n", "◆\n");
+
 		Runtime r = Runtime.getRuntime();
-		Process p = r.exec(new String[]{"tidy",
-					"-asxml",
-					"-numeric",
-					"-utf8",
-					"-quiet",
-					"--doctype", "omit",
-					"--logical-emphasis", "true",
-					"--show-errors", "0"});
+		Process p = r.exec(new String[]{"tidy", // http://tidy.sourceforge.net
+					"-asxml", // well formed XHTML
+					"-numeric", // numeric entities
+					"-utf8", // encoding
+					"--show-warnings", "false", // we are not interested in any warnings
+					"--show-errors", "0", // nor in errors
+					"--doctype", "omit", // no doctype needed (we add our own in XSLT if required)
+					"--logical-emphasis", "true", // em and strong instead of i and b
+					"--literal-attributes", "true", // preserve spaces and line breaks in attributes
+					"--force-output", "true" // unknown tags are dropped, only their content is kept
+				});
 
 		PrintStream vstupProcesu = new PrintStream(p.getOutputStream());
 		vstupProcesu.print(inputText);
@@ -173,6 +189,9 @@
 
 		String outputText = streamToString(p.getInputStream());
 
+		outputText = outputText.replaceAll("◆\\n", "\n");
+		outputText = outputText.replaceAll("◆", "\n");
+
 		return outputText;
 	}