Merge pull request #2668 from simong/tidy

Tidy HTML before trying to convert it with abiword
This commit is contained in:
John McLear 2015-05-18 20:04:15 +01:00
commit 5615bab0d9
5 changed files with 156 additions and 35 deletions

View File

@ -10,12 +10,12 @@
// favicon default name // favicon default name
// alternatively, set up a fully specified Url to your own favicon // alternatively, set up a fully specified Url to your own favicon
"favicon": "favicon.ico", "favicon": "favicon.ico",
//IP and port which etherpad should bind at //IP and port which etherpad should bind at
"ip": "0.0.0.0", "ip": "0.0.0.0",
"port" : 9001, "port" : 9001,
/* /*
// Node native SSL support // Node native SSL support
// this is disabled by default // this is disabled by default
// //
@ -37,17 +37,17 @@
"dbSettings" : { "dbSettings" : {
"filename" : "var/dirty.db" "filename" : "var/dirty.db"
}, },
/* An Example of MySQL Configuration /* An Example of MySQL Configuration
"dbType" : "mysql", "dbType" : "mysql",
"dbSettings" : { "dbSettings" : {
"user" : "root", "user" : "root",
"host" : "localhost", "host" : "localhost",
"password": "", "password": "",
"database": "store" "database": "store"
}, },
*/ */
//the default text of a pad //the default text of a pad
"defaultPadText" : "Welcome to Etherpad!\n\nThis pad text is synchronized as you type, so that everyone viewing this page sees the same text. This allows you to collaborate seamlessly on documents!\n\nGet involved with Etherpad at http:\/\/etherpad.org\n", "defaultPadText" : "Welcome to Etherpad!\n\nThis pad text is synchronized as you type, so that everyone viewing this page sees the same text. This allows you to collaborate seamlessly on documents!\n\nGet involved with Etherpad at http:\/\/etherpad.org\n",
@ -65,7 +65,7 @@
"chatAndUsers": false, "chatAndUsers": false,
"lang": "en-gb" "lang": "en-gb"
}, },
/* Should we suppress errors from being visible in the default Pad Text? */ /* Should we suppress errors from being visible in the default Pad Text? */
"suppressErrorsInPadText" : false, "suppressErrorsInPadText" : false,
@ -77,35 +77,39 @@
/* Users, who have a valid session, automatically get granted access to password protected pads */ /* Users, who have a valid session, automatically get granted access to password protected pads */
"sessionNoPassword" : false, "sessionNoPassword" : false,
/* if true, all css & js will be minified before sending to the client. This will improve the loading performance massively, /* if true, all css & js will be minified before sending to the client. This will improve the loading performance massively,
but makes it impossible to debug the javascript/css */ but makes it impossible to debug the javascript/css */
"minify" : true, "minify" : true,
/* How long may clients use served javascript code (in seconds)? Without versioning this /* How long may clients use served javascript code (in seconds)? Without versioning this
may cause problems during deployment. Set to 0 to disable caching */ may cause problems during deployment. Set to 0 to disable caching */
"maxAge" : 21600, // 60 * 60 * 6 = 6 hours "maxAge" : 21600, // 60 * 60 * 6 = 6 hours
/* This is the path to the Abiword executable. Setting it to null disables Abiword. /* This is the path to the Abiword executable. Setting it to null disables Abiword.
Abiword is needed for advanced import/export features of pads*/ Abiword is needed for advanced import/export features of pads*/
"abiword" : null, "abiword" : null,
/* This is the path to the Tidy executable. Setting it to null disables Tidy.
Tidy is used to clean up exported HTML before it is converted by Abiword, which improves the quality of exported pads*/
"tidyHtml" : null,
/* Allow import of file types other than the supported types: txt, doc, docx, rtf, odt, html & htm */ /* Allow import of file types other than the supported types: txt, doc, docx, rtf, odt, html & htm */
"allowUnknownFileEnds" : true, "allowUnknownFileEnds" : true,
/* This setting is used if you require authentication of all users. /* This setting is used if you require authentication of all users.
Note: /admin always requires authentication. */ Note: /admin always requires authentication. */
"requireAuthentication" : false, "requireAuthentication" : false,
/* Require authorization by a module, or a user with is_admin set, see below. */ /* Require authorization by a module, or a user with is_admin set, see below. */
"requireAuthorization" : false, "requireAuthorization" : false,
/*when you use NginX or another proxy/ load-balancer set this to true*/ /*when you use NginX or another proxy/ load-balancer set this to true*/
"trustProxy" : false, "trustProxy" : false,
/* Privacy: disable IP logging */ /* Privacy: disable IP logging */
"disableIPlogging" : false, "disableIPlogging" : false,
/* Users for basic authentication. is_admin = true gives access to /admin. /* Users for basic authentication. is_admin = true gives access to /admin.
If you do not uncomment this, /admin will not be available! */ If you do not uncomment this, /admin will not be available! */
/* /*
@ -126,7 +130,7 @@
// Allow Load Testing tools to hit the Etherpad Instance. Warning this will disable security on the instance. // Allow Load Testing tools to hit the Etherpad Instance. Warning this will disable security on the instance.
"loadTest": false, "loadTest": false,
/* The toolbar buttons configuration. /* The toolbar buttons configuration.
"toolbar": { "toolbar": {
"left": [ "left": [
@ -148,7 +152,7 @@
/* The log level we are using, can be: DEBUG, INFO, WARN, ERROR */ /* The log level we are using, can be: DEBUG, INFO, WARN, ERROR */
"loglevel": "INFO", "loglevel": "INFO",
//Logging configuration. See log4js documentation for further information //Logging configuration. See log4js documentation for further information
// https://github.com/nomiddlename/log4js-node // https://github.com/nomiddlename/log4js-node
// You can add as many appenders as you want here: // You can add as many appenders as you want here:

View File

@ -28,6 +28,7 @@ var fs = require("fs");
var settings = require('../utils/Settings'); var settings = require('../utils/Settings');
var os = require('os'); var os = require('os');
var hooks = require("ep_etherpad-lite/static/js/pluginfw/hooks"); var hooks = require("ep_etherpad-lite/static/js/pluginfw/hooks");
var TidyHtml = require('../utils/TidyHtml');
//load abiword only if its enabled //load abiword only if its enabled
if(settings.abiword != null) if(settings.abiword != null)
@ -35,28 +36,28 @@ if(settings.abiword != null)
var tempDirectory = "/tmp"; var tempDirectory = "/tmp";
//tempDirectory changes if the operating system is windows //tempDirectory changes if the operating system is windows
if(os.type().indexOf("Windows") > -1) if(os.type().indexOf("Windows") > -1)
{ {
tempDirectory = process.env.TEMP; tempDirectory = process.env.TEMP;
} }
/** /**
* do a requested export * do a requested export
*/ */
exports.doExport = function(req, res, padId, type) exports.doExport = function(req, res, padId, type)
{ {
var fileName = padId; var fileName = padId;
// allow fileName to be overwritten by a hook, the type is kept static for security reasons // allow fileName to be overwritten by a hook, the type is kept static for security reasons
hooks.aCallFirst("exportFileName", padId, hooks.aCallFirst("exportFileName", padId,
function(err, hookFileName){ function(err, hookFileName){
// if fileName is set then set it to the padId, note that fileName is returned as an array. // if fileName is set then set it to the padId, note that fileName is returned as an array.
if(hookFileName.length) fileName = hookFileName; if(hookFileName.length) fileName = hookFileName;
//tell the browser that this is a downloadable file //tell the browser that this is a downloadable file
res.attachment(fileName + "." + type); res.attachment(fileName + "." + type);
//if this is a plain text export, we can do this directly //if this is a plain text export, we can do this directly
// We have to over engineer this because tabs are stored as attributes and not plain text // We have to over engineer this because tabs are stored as attributes and not plain text
if(type == "etherpad"){ if(type == "etherpad"){
@ -72,7 +73,7 @@ exports.doExport = function(req, res, padId, type)
var txt; var txt;
var randNum; var randNum;
var srcFile, destFile; var srcFile, destFile;
async.series([ async.series([
//render the txt document //render the txt document
function(callback) function(callback)
@ -96,7 +97,7 @@ exports.doExport = function(req, res, padId, type)
{ {
//ensure html can be collected by the garbage collector //ensure html can be collected by the garbage collector
txt = null; txt = null;
destFile = tempDirectory + "/etherpad_export_" + randNum + "." + type; destFile = tempDirectory + "/etherpad_export_" + randNum + "." + type;
abiword.convertFile(srcFile, destFile, type, callback); abiword.convertFile(srcFile, destFile, type, callback);
}, },
@ -140,7 +141,7 @@ exports.doExport = function(req, res, padId, type)
var html; var html;
var randNum; var randNum;
var srcFile, destFile; var srcFile, destFile;
async.series([ async.series([
//render the html document //render the html document
function(callback) function(callback)
@ -150,7 +151,7 @@ exports.doExport = function(req, res, padId, type)
if(ERR(err, callback)) return; if(ERR(err, callback)) return;
html = _html; html = _html;
callback(); callback();
}); });
}, },
//decide what to do with the html export //decide what to do with the html export
function(callback) function(callback)
@ -162,22 +163,29 @@ exports.doExport = function(req, res, padId, type)
hooks.aCallFirst("exportHTMLSend", html, function(err, newHTML){ hooks.aCallFirst("exportHTMLSend", html, function(err, newHTML){
if(newHTML.length) html = newHTML; if(newHTML.length) html = newHTML;
res.send(html); res.send(html);
callback("stop"); callback("stop");
}); });
} }
else //write the html export to a file else //write the html export to a file
{ {
randNum = Math.floor(Math.random()*0xFFFFFFFF); randNum = Math.floor(Math.random()*0xFFFFFFFF);
srcFile = tempDirectory + "/etherpad_export_" + randNum + ".html"; srcFile = tempDirectory + "/etherpad_export_" + randNum + ".html";
fs.writeFile(srcFile, html, callback); fs.writeFile(srcFile, html, callback);
} }
}, },
//send the convert job to abiword
// Tidy up the exported HTML
function(callback) function(callback)
{ {
//ensure html can be collected by the garbage collector //ensure html can be collected by the garbage collector
html = null; html = null;
TidyHtml.tidy(srcFile, callback);
},
//send the convert job to abiword
function(callback)
{
destFile = tempDirectory + "/etherpad_export_" + randNum + "." + type; destFile = tempDirectory + "/etherpad_export_" + randNum + "." + type;
abiword.convertFile(srcFile, destFile, type, callback); abiword.convertFile(srcFile, destFile, type, callback);
}, },
@ -199,7 +207,7 @@ exports.doExport = function(req, res, padId, type)
//100ms delay to accommodate for slow windows fs //100ms delay to accommodate for slow windows fs
if(os.type().indexOf("Windows") > -1) if(os.type().indexOf("Windows") > -1)
{ {
setTimeout(function() setTimeout(function()
{ {
fs.unlink(destFile, callback); fs.unlink(destFile, callback);
}, 100); }, 100);

View File

@ -152,6 +152,11 @@ exports.minify = true;
*/ */
exports.abiword = null; exports.abiword = null;
/**
* The path of the tidy executable
*/
exports.tidyHtml = null;
/** /**
* Should we support file types that are not natively supported on import? * Should we support file types that are not natively supported on import?
*/ */
@ -167,7 +172,7 @@ exports.loglevel = "INFO";
*/ */
exports.disableIPlogging = false; exports.disableIPlogging = false;
/** /**
* Disable Load Testing * Disable Load Testing
*/ */
exports.loadTest = false; exports.loadTest = false;
@ -239,7 +244,7 @@ exports.reloadSettings = function reloadSettings() {
} else { } else {
settingsFilename = path.resolve(path.join(exports.root, settingsFilename)); settingsFilename = path.resolve(path.join(exports.root, settingsFilename));
} }
var settingsStr; var settingsStr;
try{ try{
//read the settings sync //read the settings sync

View File

@ -0,0 +1,41 @@
/**
* Tidy up the HTML in a given file
*/
var log4js = require('log4js');
var settings = require('./Settings');
var spawn = require('child_process').spawn;
exports.tidy = function(srcFile, callback) {
var logger = log4js.getLogger('TidyHtml');
// Don't do anything if Tidy hasn't been enabled
if (!settings.tidyHtml) {
logger.debug('tidyHtml has not been configured yet, ignoring tidy request');
return callback(null);
}
var errMessage = '';
// Spawn a new tidy instance that cleans up the file inline
logger.debug('Tidying ' + srcFile);
var tidy = spawn(settings.tidyHtml, ['-modify', srcFile]);
// Keep track of any error messages
tidy.stderr.on('data', function (data) {
errMessage += data.toString();
});
// Wait until Tidy is done
tidy.on('close', function(code) {
// Tidy returns a 0 when no errors occur and a 1 exit code when
// the file could be tidied but a few warnings were generated
if (code === 0 || code === 1) {
logger.debug('Tidied ' + srcFile + ' successfully');
return callback(null);
} else {
logger.error('Failed to tidy ' + srcFile + '\n' + errMessage);
return callback('Tidy died with exit code ' + code);
}
});
};

View File

@ -0,0 +1,63 @@
// TidyHtml and Settings start out as null; they are required inside the
// suite's before() hook, once npm.load() has completed.
var assert = require('assert'),
    fs = require('fs'),
    path = require('path'),
    TidyHtml = null,
    Settings = null;

var npm = require("../../../../src/node_modules/npm/lib/npm.js");
// Tests for the TidyHtml utility. Both tests are skipped when no Tidy
// executable has been configured in the settings.
describe('tidyHtml', function() {
  before(function(done) {
    // The modules under test are only required after npm has been loaded
    npm.load({}, function(err) {
      assert.ok(!err);
      TidyHtml = require('../../../../src/node/utils/TidyHtml');
      Settings = require('../../../../src/node/utils/Settings');
      return done();
    });
  });

  it('Tidies HTML', function(done) {
    // If the user hasn't configured Tidy, we skip this test as Tidy is required for it
    if (!Settings.tidyHtml) {
      this.skip();
    }

    // Try to tidy up a bad HTML file
    var tmpDir = process.env.TEMP || "/tmp";
    var tmpFile = path.join(tmpDir, 'tmp_' + (Math.floor(Math.random() * 1000000)) + '.html');
    fs.writeFileSync(tmpFile, '<html><body><p>a paragraph</p><li>List without outer UL</li>trailing closing p</p></body></html>');

    TidyHtml.tidy(tmpFile, function(err) {
      var cleanedHtml;
      try {
        assert.ok(!err);

        // Read the file again
        cleanedHtml = fs.readFileSync(tmpFile).toString();
      } finally {
        // Always remove the temporary file so test runs don't litter the temp directory
        fs.unlinkSync(tmpFile);
      }

      var expectedHtml = [
        '<title></title>',
        '</head>',
        '<body>',
        '<p>a paragraph</p>',
        '<ul>',
        '<li>List without outer UL</li>',
        '<li style="list-style: none">trailing closing p</li>',
        '</ul>',
        '</body>',
        '</html>',
      ].join('\n');
      assert.notStrictEqual(cleanedHtml.indexOf(expectedHtml), -1);
      return done();
    });
  });

  it('can deal with errors', function(done) {
    // If the user hasn't configured Tidy, we skip this test as Tidy is required for it
    if (!Settings.tidyHtml) {
      this.skip();
    }

    // Tidying a file that does not exist must report an error through the callback
    TidyHtml.tidy('/some/none/existing/file.html', function(err) {
      assert.ok(err);
      return done();
    });
  });
});