Drupal: čištění HTML – Tidy.
1.1 --- a/src/org/sonews/storage/DrupalMessage.java Wed Oct 12 23:10:31 2011 +0200
1.2 +++ b/src/org/sonews/storage/DrupalMessage.java Thu Oct 13 03:09:22 2011 +0200
1.3 @@ -17,8 +17,12 @@
1.4 */
1.5 package org.sonews.storage;
1.6
1.7 +import java.io.BufferedReader;
1.8 import java.io.ByteArrayOutputStream;
1.9 import java.io.IOException;
1.10 +import java.io.InputStream;
1.11 +import java.io.InputStreamReader;
1.12 +import java.io.PrintStream;
1.13 import java.io.StringReader;
1.14 import java.io.StringWriter;
1.15 import java.io.UnsupportedEncodingException;
1.16 @@ -114,8 +118,22 @@
1.17 */
1.18 try {
1.19 String originalText = rs.getString("text");
1.20 - StringReader input = new StringReader("<body>" + originalText + "</body>");
1.21 - StringWriter output = new StringWriter();
1.22 +
1.23 + /**
1.24 + * TODO: používat cache, ukládat si vygenerované články
1.25 + *
1.26 + *
1.27 + * Místo markdownu jen ošetřit:
1.28 + * - odstavce
1.29 + * - nesmyslné entity v odkazech
1.30 + * - neuzavřené značky: br, hr, img
1.31 + */
1.32 + String tidyTexy = tidyXhtml("<html><body>" + originalText + "</body></html>");
1.33 +
1.34 +
1.35 +
1.36 + StringReader input = new StringReader(tidyTexy);
1.37 + StringWriter output = new StringWriter(2 * tidyTexy.length());
1.38 TransformerFactory tf = TransformerFactory.newInstance();
1.39 Transformer t = tf.newTransformer(new StreamSource(Resource.getAsStream("helpers/mimeXhtmlPart.xsl")));
1.40 t.setParameter("isRoot", (rs.getInt("parent_id") == 0));
1.41 @@ -124,7 +142,7 @@
1.42 t.setParameter("wwwRead", rs.getString("wwwRead"));
1.43 t.setParameter("wwwPost", rs.getString("wwwPost"));
1.44 t.transform(new StreamSource(input), new StreamResult(output));
1.45 -
1.46 +
1.47 return output.toString();
1.48 } catch (Exception e) {
1.49 /**
1.50 @@ -135,6 +153,47 @@
1.51 }
1.52 }
1.53
1.54 + /**
1.55 + * TODO: refaktorovat, přesunout
1.56 + */
1.57 + private static String tidyXhtml(String inputText) throws IOException {
1.58 + Runtime r = Runtime.getRuntime();
1.59 + Process p = r.exec(new String[]{"tidy",
1.60 + "-asxml",
1.61 + "-numeric",
1.62 + "-utf8",
1.63 + "-quiet",
1.64 + "--doctype", "omit",
1.65 + "--logical-emphasis", "true",
1.66 + "--show-errors", "0"});
1.67 +
1.68 + PrintStream vstupProcesu = new PrintStream(p.getOutputStream());
1.69 + vstupProcesu.print(inputText);
1.70 + vstupProcesu.close();
1.71 +
1.72 + String outputText = streamToString(p.getInputStream());
1.73 +
1.74 + return outputText;
1.75 + }
1.76 +
1.77 + /**
1.78 + * TODO: refaktorovat, přesunout
1.79 + */
1.80 + private static String streamToString(InputStream proud) throws IOException {
1.81 + StringBuilder výsledek = new StringBuilder();
1.82 + BufferedReader buf = new BufferedReader(new InputStreamReader(proud));
1.83 + while (true) {
1.84 + String radek = buf.readLine();
1.85 + if (radek == null) {
1.86 + break;
1.87 + } else {
1.88 + výsledek.append(radek);
1.89 + výsledek.append("\n");
1.90 + }
1.91 + }
1.92 + return výsledek.toString();
1.93 + }
1.94 +
1.95 private static String constructMessageId(int articleID, int groupID, String groupName, String domainName) {
1.96 StringBuilder sb = new StringBuilder();
1.97 sb.append("<");