Drupal: tidy, odstavce, XSLT
author František Kučera <franta-hg@frantovo.cz>
Sun Oct 16 22:36:46 2011 +0200 (2011-10-16)
changeset 82 21f413541357
parent 81 b51ab80c7a9d
child 83 668014315a54
Drupal: tidy, odstavce, XSLT
src/org/sonews/storage/DrupalMessage.java
     1.1 --- a/src/org/sonews/storage/DrupalMessage.java	Sun Oct 16 20:55:46 2011 +0200
     1.2 +++ b/src/org/sonews/storage/DrupalMessage.java	Sun Oct 16 22:36:46 2011 +0200
     1.3 @@ -91,57 +91,67 @@
     1.4  			Multipart multipart = new MimeMultipart("alternative");
     1.5  			setContent(multipart);
     1.6  
     1.7 +			/** XHTML part */
     1.8 +			MimeBodyPart htmlPart = new MimeBodyPart();
     1.9 +			multipart.addBodyPart(htmlPart);
    1.10 +			String xhtmlText = readXhtmlText(rs);
    1.11 +			htmlPart.setContent(xhtmlText, XHTML_CONTENT_TYPE);
    1.12 +			
    1.13  			/** Plain text part */
    1.14  			MimeBodyPart textPart = new MimeBodyPart();
    1.15  			multipart.addBodyPart(textPart);
    1.16 -			textPart.setText(readPlainText(rs));
    1.17 -
    1.18 -			/** XHTML part */
    1.19 -			MimeBodyPart htmlPart = new MimeBodyPart();
    1.20 -			multipart.addBodyPart(htmlPart);
    1.21 -			htmlPart.setContent(readXhtmlText(rs), XHTML_CONTENT_TYPE);
    1.22 +			textPart.setText(readPlainText(rs, xhtmlText));
    1.23  		} else {
    1.24 +			/** empty body, just headers */
    1.25  			setText("");
    1.26  		}
    1.27  	}
    1.28  
    1.29 -	private String readPlainText(ResultSet rs) {
    1.30 +	private String readPlainText(ResultSet rs, String xhtmlText) {
    1.31  		/**
    1.32  		 * TODO: převést na prostý text
    1.33  		 */
    1.34 -		return "TODO: obyčejný text";
    1.35 +		return "TODO: obyčejný text\n\n\n" + xhtmlText;
    1.36  	}
    1.37  
    1.38  	private String readXhtmlText(ResultSet rs) {
    1.39  		/**
    1.40 -		 * TODO: znovupoužívat XSL transformér
    1.41 +		 * TODO: 
    1.42 +		 *		- znovupoužívat XSL transformér
    1.43 +		 *		- používat cache, ukládat si vygenerované články
    1.44  		 */
    1.45  		try {
    1.46 -			String originalText = rs.getString("text");
    1.47 +			String inputText = "<html><body>" + rs.getString("text") + "</body></html>";
    1.48  
    1.49 -			/**
    1.50 -			 * TODO: používat cache, ukládat si vygenerované články
    1.51 -			 * 
    1.52 -			 * 
    1.53 -			 * Místo markdownu jen ošetřit:
    1.54 -			 *		- odstavce
    1.55 -			 *		- nesmyslné entity v odkazech
    1.56 -			 *		- neuzavřené značky: br, hr, img
    1.57 -			 */
    1.58 -			String tidyTexy = tidyXhtml("<html><body>" + originalText + "</body></html>");
    1.59 +			TransformerFactory tf = TransformerFactory.newInstance();
    1.60 +			Transformer paragraphTransformer = tf.newTransformer(new StreamSource(Resource.getAsStream("helpers/mimeXhtmlPart-make-paragraphs.xsl")));
    1.61  
    1.62 +			String paragraphedText;
    1.63 +			boolean tidyWasUsed = false;
    1.64 +			try {
    1.65 +				StringReader input = new StringReader(inputText);
    1.66 +				StringWriter output = new StringWriter(2 * inputText.length());
    1.67 +				paragraphTransformer.transform(new StreamSource(input), new StreamResult(output));
    1.68 +				paragraphedText = output.toString();
    1.69 +			} catch (Exception e) {
    1.70 +				log.log(Level.FINER, "HTML input was shitty – Tidy had to be called.", e);
    1.71 +				StringReader input = new StringReader(tidyXhtml(inputText));
    1.72 +				StringWriter output = new StringWriter(2 * inputText.length());
    1.73 +				paragraphTransformer.transform(new StreamSource(input), new StreamResult(output));
    1.74 +				paragraphedText = output.toString();
    1.75 +				tidyWasUsed = true;
    1.76 +			}
    1.77  
    1.78 -
    1.79 -			StringReader input = new StringReader(tidyTexy);
    1.80 -			StringWriter output = new StringWriter(2 * tidyTexy.length());
    1.81 -			TransformerFactory tf = TransformerFactory.newInstance();
    1.82 -			Transformer t = tf.newTransformer(new StreamSource(Resource.getAsStream("helpers/mimeXhtmlPart.xsl")));
    1.83 -			t.setParameter("isRoot", (rs.getInt("parent_id") == 0));
    1.84 -			t.setParameter("title", rs.getString("subject"));
    1.85 -			t.setParameter("urlBase", rs.getString("urlBase"));
    1.86 -			t.setParameter("wwwRead", rs.getString("wwwRead"));
    1.87 -			t.setParameter("wwwPost", rs.getString("wwwPost"));
    1.88 -			t.transform(new StreamSource(input), new StreamResult(output));
    1.89 +			Transformer xhtmlTransformer = tf.newTransformer(new StreamSource(Resource.getAsStream("helpers/mimeXhtmlPart.xsl")));
    1.90 +			xhtmlTransformer.setParameter("isRoot", (rs.getInt("parent_id") == 0));
    1.91 +			xhtmlTransformer.setParameter("title", rs.getString("subject"));
    1.92 +			xhtmlTransformer.setParameter("urlBase", rs.getString("urlBase"));
    1.93 +			xhtmlTransformer.setParameter("wwwRead", rs.getString("wwwRead"));
    1.94 +			xhtmlTransformer.setParameter("wwwPost", rs.getString("wwwPost"));
    1.95 +			xhtmlTransformer.setParameter("headComment", String.format("Drupal-NNTP bridge. Transformed: %1$tc. Tidy had to be used: %2$b", new Date(), tidyWasUsed));
    1.96 +			StringReader input = new StringReader(paragraphedText);
    1.97 +			StringWriter output = new StringWriter(2 * paragraphedText.length());
    1.98 +			xhtmlTransformer.transform(new StreamSource(input), new StreamResult(output));
    1.99  
   1.100  			return output.toString();
   1.101  		} catch (Exception e) {
   1.102 @@ -157,15 +167,21 @@
   1.103  	 * TODO: refaktorovat, přesunout
   1.104  	 */
   1.105  	private static String tidyXhtml(String inputText) throws IOException {
   1.106 +		// https://sourceforge.net/tracker/index.php?func=detail&aid=3424437&group_id=27659&atid=390966
   1.107 +		inputText = inputText.replaceAll("\\n", "◆\n");
   1.108 +
   1.109  		Runtime r = Runtime.getRuntime();
   1.110 -		Process p = r.exec(new String[]{"tidy",
   1.111 -					"-asxml",
   1.112 -					"-numeric",
   1.113 -					"-utf8",
   1.114 -					"-quiet",
   1.115 -					"--doctype", "omit",
   1.116 -					"--logical-emphasis", "true",
   1.117 -					"--show-errors", "0"});
   1.118 +		Process p = r.exec(new String[]{"tidy", // http://tidy.sourceforge.net
   1.119 +					"-asxml", // well formed XHTML
   1.120 +					"-numeric", // číselné entity
   1.121 +					"-utf8", // kódování
   1.122 +					"--show-warnings", "false", // žádná varování nás nezajímají
   1.123 +					"--show-errors", "0", // ani chyby
   1.124 +					"--doctype", "omit", // doctype nepotřebujeme (doplníme si případně vlastní v XSLT)
   1.125 +					"--logical-emphasis", "true", // em a strong místo i a b
   1.126 +					"--literal-attributes", "true", // zachovat mezery a konce řádků v atributech
   1.127 +					"--force-output", "true" // neznámé značky zahodíme, vložíme jen jejich obsah
   1.128 +				});
   1.129  
   1.130  		PrintStream vstupProcesu = new PrintStream(p.getOutputStream());
   1.131  		vstupProcesu.print(inputText);
   1.132 @@ -173,6 +189,9 @@
   1.133  
   1.134  		String outputText = streamToString(p.getInputStream());
   1.135  
   1.136 +		outputText = outputText.replaceAll("◆\\n", "\n");
   1.137 +		outputText = outputText.replaceAll("◆", "\n");
   1.138 +
   1.139  		return outputText;
   1.140  	}
   1.141