From 786b43efc893ceb3fcf3d1d6acb9f46fc4b5426f Mon Sep 17 00:00:00 2001 From: Simon Gaeremynck Date: Mon, 18 May 2015 16:24:41 +0100 Subject: [PATCH 1/3] Tidy HTML before trying to convert it with abiword --- settings.json.template | 42 +++++++++++++++++-------------- src/node/handler/ExportHandler.js | 36 +++++++++++++++----------- src/node/utils/Settings.js | 9 +++++-- src/node/utils/TidyHtml.js | 35 ++++++++++++++++++++++++++ 4 files changed, 87 insertions(+), 35 deletions(-) create mode 100644 src/node/utils/TidyHtml.js diff --git a/settings.json.template b/settings.json.template index 7d9c62cc..310e0791 100644 --- a/settings.json.template +++ b/settings.json.template @@ -10,12 +10,12 @@ // favicon default name // alternatively, set up a fully specified Url to your own favicon "favicon": "favicon.ico", - + //IP and port which etherpad should bind at "ip": "0.0.0.0", "port" : 9001, - /* + /* // Node native SSL support // this is disabled by default // @@ -37,17 +37,17 @@ "dbSettings" : { "filename" : "var/dirty.db" }, - + /* An Example of MySQL Configuration "dbType" : "mysql", "dbSettings" : { - "user" : "root", - "host" : "localhost", - "password": "", + "user" : "root", + "host" : "localhost", + "password": "", "database": "store" }, */ - + //the default text of a pad "defaultPadText" : "Welcome to Etherpad!\n\nThis pad text is synchronized as you type, so that everyone viewing this page sees the same text. This allows you to collaborate seamlessly on documents!\n\nGet involved with Etherpad at http:\/\/etherpad.org\n", @@ -65,7 +65,7 @@ "chatAndUsers": false, "lang": "en-gb" }, - + /* Shoud we suppress errors from being visible in the default Pad Text? */ "suppressErrorsInPadText" : false, @@ -77,35 +77,39 @@ /* Users, who have a valid session, automatically get granted access to password protected pads */ "sessionNoPassword" : false, - - /* if true, all css & js will be minified before sending to the client. This will improve the loading performance massivly, + + /* if true, all css & js will be minified before sending to the client. This will improve the loading performance massivly, but makes it impossible to debug the javascript/css */ "minify" : true, /* How long may clients use served javascript code (in seconds)? Without versioning this may cause problems during deployment. Set to 0 to disable caching */ "maxAge" : 21600, // 60 * 60 * 6 = 6 hours - + /* This is the path to the Abiword executable. Setting it to null, disables abiword. - Abiword is needed to advanced import/export features of pads*/ + Abiword is needed to advanced import/export features of pads*/ "abiword" : null, + /* This is the path to the Tidy executable. Setting it to null, disables Tidy. + Tidy is used to improve the quality of exported pads*/ + "tidyHtml" : null, + /* Allow import of file types other than the supported types: txt, doc, docx, rtf, odt, html & htm */ "allowUnknownFileEnds" : true, - + /* This setting is used if you require authentication of all users. Note: /admin always requires authentication. */ "requireAuthentication" : false, /* Require authorization by a module, or a user with is_admin set, see below. */ "requireAuthorization" : false, - + /*when you use NginX or another proxy/ load-balancer set this to true*/ "trustProxy" : false, - + /* Privacy: disable IP logging */ - "disableIPlogging" : false, - + "disableIPlogging" : false, + /* Users for basic authentication. is_admin = true gives access to /admin. If you do not uncomment this, /admin will not be available! */ /* @@ -126,7 +130,7 @@ // Allow Load Testing tools to hit the Etherpad Instance. Warning this will disable security on the instance. "loadTest": false, - + /* The toolbar buttons configuration. "toolbar": { "left": [ @@ -148,7 +152,7 @@ /* The log level we are using, can be: DEBUG, INFO, WARN, ERROR */ "loglevel": "INFO", - + //Logging configuration. See log4js documentation for further information // https://github.com/nomiddlename/log4js-node // You can add as many appenders as you want here: diff --git a/src/node/handler/ExportHandler.js b/src/node/handler/ExportHandler.js index f20e8715..f861c82e 100644 --- a/src/node/handler/ExportHandler.js +++ b/src/node/handler/ExportHandler.js @@ -28,6 +28,7 @@ var fs = require("fs"); var settings = require('../utils/Settings'); var os = require('os'); var hooks = require("ep_etherpad-lite/static/js/pluginfw/hooks"); +var TidyHtml = require('../utils/TidyHtml'); //load abiword only if its enabled if(settings.abiword != null) @@ -35,28 +36,28 @@ if(settings.abiword != null) var tempDirectory = "/tmp"; -//tempDirectory changes if the operating system is windows +//tempDirectory changes if the operating system is windows if(os.type().indexOf("Windows") > -1) { tempDirectory = process.env.TEMP; } - + /** * do a requested export - */ + */ exports.doExport = function(req, res, padId, type) { var fileName = padId; // allow fileName to be overwritten by a hook, the type type is kept static for security reasons - hooks.aCallFirst("exportFileName", padId, + hooks.aCallFirst("exportFileName", padId, function(err, hookFileName){ // if fileName is set then set it to the padId, note that fileName is returned as an array. if(hookFileName.length) fileName = hookFileName; //tell the browser that this is a downloadable file res.attachment(fileName + "." + type); - + //if this is a plain text export, we can do this directly // We have to over engineer this because tabs are stored as attributes and not plain text if(type == "etherpad"){ @@ -72,7 +73,7 @@ exports.doExport = function(req, res, padId, type) var txt; var randNum; var srcFile, destFile; - + async.series([ //render the txt document function(callback) @@ -96,7 +97,7 @@ exports.doExport = function(req, res, padId, type) { //ensure html can be collected by the garbage collector txt = null; - + destFile = tempDirectory + "/etherpad_export_" + randNum + "." + type; abiword.convertFile(srcFile, destFile, type, callback); }, @@ -140,7 +141,7 @@ exports.doExport = function(req, res, padId, type) var html; var randNum; var srcFile, destFile; - + async.series([ //render the html document function(callback) @@ -150,7 +151,7 @@ exports.doExport = function(req, res, padId, type) if(ERR(err, callback)) return; html = _html; callback(); - }); + }); }, //decide what to do with the html export function(callback) @@ -162,22 +163,29 @@ exports.doExport = function(req, res, padId, type) hooks.aCallFirst("exportHTMLSend", html, function(err, newHTML){ if(newHTML.length) html = newHTML; res.send(html); - callback("stop"); + callback("stop"); }); } else //write the html export to a file { randNum = Math.floor(Math.random()*0xFFFFFFFF); srcFile = tempDirectory + "/etherpad_export_" + randNum + ".html"; - fs.writeFile(srcFile, html, callback); + fs.writeFile(srcFile, html, callback); } }, - //send the convert job to abiword + + // Tidy up the exported HTML function(callback) { //ensure html can be collected by the garbage collector html = null; - + + TidyHtml.tidy(srcFile, callback); + }, + + //send the convert job to abiword + function(callback) + { destFile = tempDirectory + "/etherpad_export_" + randNum + "." + type; abiword.convertFile(srcFile, destFile, type, callback); }, @@ -199,7 +207,7 @@ exports.doExport = function(req, res, padId, type) //100ms delay to accomidate for slow windows fs if(os.type().indexOf("Windows") > -1) { - setTimeout(function() + setTimeout(function() { fs.unlink(destFile, callback); }, 100); diff --git a/src/node/utils/Settings.js b/src/node/utils/Settings.js index b7d1f0bc..2cc6a926 100644 --- a/src/node/utils/Settings.js +++ b/src/node/utils/Settings.js @@ -152,6 +152,11 @@ exports.minify = true; */ exports.abiword = null; +/** + * The path of the tidy executable + */ +exports.tidyHtml = null; + /** * Should we support none natively supported file types on import? */ @@ -167,7 +172,7 @@ exports.loglevel = "INFO"; */ exports.disableIPlogging = false; -/** +/** * Disable Load Testing */ exports.loadTest = false; @@ -239,7 +244,7 @@ exports.reloadSettings = function reloadSettings() { } else { settingsFilename = path.resolve(path.join(exports.root, settingsFilename)); } - + var settingsStr; try{ //read the settings sync diff --git a/src/node/utils/TidyHtml.js b/src/node/utils/TidyHtml.js new file mode 100644 index 00000000..13dc2ece --- /dev/null +++ b/src/node/utils/TidyHtml.js @@ -0,0 +1,35 @@ +/** + * Tidy up the HTML in a given file + */ + +var settings = require("./Settings"); +var spawn = require('child_process').spawn; + +exports.tidy = function(srcFile, callback) { + // Don't do anything if Tidy hasn't been enabled + if (!settings.tidyHtml) { + return callback(null); + } + + var errMessage = ''; + + // Spawn a new tidy instance that cleans up the file inline + var tidy = spawn(settings.tidyHtml, ['-modify', srcFile]); + + // Keep track of any error messages + tidy.stderr.on('data', function (data) { + errMessage += data.toString(); + }); + + // Wait until Tidy is done + tidy.on('close', function(code) { + // Tidy returns a 0 when no errors occur and a 1 exit code when + // the file could be tidied but a few warnings were generated + if (code === 0 || code === 1) { + return callback(null); + } else { + console.error(errMessage); + return callback('Tidy died with exit code ' + code); + } + }); +}; From 7fe99cccad64707a46d3120d6a18f2e6d0724b90 Mon Sep 17 00:00:00 2001 From: Simon Gaeremynck Date: Mon, 18 May 2015 17:43:46 +0100 Subject: [PATCH 2/3] Using log4js in TidyHtml --- src/node/utils/TidyHtml.js | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/node/utils/TidyHtml.js b/src/node/utils/TidyHtml.js index 13dc2ece..5d4e6ed7 100644 --- a/src/node/utils/TidyHtml.js +++ b/src/node/utils/TidyHtml.js @@ -2,18 +2,23 @@ * Tidy up the HTML in a given file */ -var settings = require("./Settings"); +var log4js = require('log4js'); +var settings = require('./Settings'); var spawn = require('child_process').spawn; exports.tidy = function(srcFile, callback) { + var logger = log4js.getLogger('TidyHtml'); + // Don't do anything if Tidy hasn't been enabled if (!settings.tidyHtml) { + logger.debug('tidyHtml has not been configured yet, ignoring tidy request'); return callback(null); } var errMessage = ''; // Spawn a new tidy instance that cleans up the file inline + logger.debug('Tidying ' + srcFile); var tidy = spawn(settings.tidyHtml, ['-modify', srcFile]); // Keep track of any error messages @@ -26,9 +31,10 @@ exports.tidy = function(srcFile, callback) { // Tidy returns a 0 when no errors occur and a 1 exit code when // the file could be tidied but a few warnings were generated if (code === 0 || code === 1) { + logger.debug('Tidied ' + srcFile + ' successfully'); return callback(null); } else { - console.error(errMessage); + logger.error('Failed to tidy ' + srcFile + '\n' + errMessage); return callback('Tidy died with exit code ' + code); } }); From fd9d0bc291cbfca042f818d76541464c170dc130 Mon Sep 17 00:00:00 2001 From: Simon Gaeremynck Date: Mon, 18 May 2015 17:44:11 +0100 Subject: [PATCH 3/3] Added backend tests for TidyHtml --- tests/backend/specs/api/tidy.js | 63 +++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 tests/backend/specs/api/tidy.js diff --git a/tests/backend/specs/api/tidy.js b/tests/backend/specs/api/tidy.js new file mode 100644 index 00000000..47cb49f6 --- /dev/null +++ b/tests/backend/specs/api/tidy.js @@ -0,0 +1,63 @@ +var assert = require('assert') + fs = require('fs'), + path = require('path'), + TidyHtml = null, + Settings = null; + +var npm = require("../../../../src/node_modules/npm/lib/npm.js"); + +describe('tidyHtml', function() { + before(function(done) { + npm.load({}, function(err) { + assert.ok(!err); + TidyHtml = require('../../../../src/node/utils/TidyHtml'); + Settings = require('../../../../src/node/utils/Settings'); + return done() + }); + }); + + it('Tidies HTML', function(done) { + // If the user hasn't configured Tidy, we skip this tests as it's required for this test + if (!Settings.tidyHtml) { + this.skip(); + } + + // Try to tidy up a bad HTML file + var tmpDir = process.env.TEMP || "/tmp"; + var tmpFile = path.join(tmpDir, 'tmp_' + (Math.floor(Math.random() * 1000000)) + '.html') + fs.writeFileSync(tmpFile, '

a paragraph

  • List without outer UL
  • trailing closing p

    '); + TidyHtml.tidy(tmpFile, function(err){ + assert.ok(!err); + + // Read the file again + var cleanedHtml = fs.readFileSync(tmpFile).toString(); + + var expectedHtml = [ + '', + '', + '', + '

    a paragraph

    ', + '
      ', + '
    • List without outer UL
    • ', + '
    • trailing closing p
    • ', + '
    ', + '', + '', + ].join('\n'); + assert.notStrictEqual(cleanedHtml.indexOf(expectedHtml), -1); + return done(); + }); + }); + + it('can deal with errors', function(done) { + // If the user hasn't configured Tidy, we skip this tests as it's required for this test + if (!Settings.tidyHtml) { + this.skip(); + } + + TidyHtml.tidy('/some/none/existing/file.html', function(err) { + assert.ok(err); + return done(); + }); + }); +});