'use strict';
/***
    Usage: blog2md b|w <BLOGGER/WordPress BACKUP XML> <OUTPUT DIR> [m|s] [paragraph-fix]

    b|w            b = Blogger (Blogspot) backup, w = WordPress backup
    m|s            m = merge comments into the post file,
                   s = write comments to a separate "-comments.md" file (default)
    paragraph-fix  WordPress only: convert blank-line separated text into <p> paragraphs
*/
const fs = require('fs');
const os = require('os');
const path = require('path');
const xml2js = require('xml2js');
const sanitize = require('sanitize-filename');
const TurndownService = require('turndown');
var moment = require('moment');

// Shared HTML -> Markdown converter used by both importers.
var tds = new TurndownService({ codeBlockStyle: 'fenced', fence: '```' });

// Render every <pre> element as a fenced code block.
tds.addRule('wppreblock', {
  filter: ['pre'],
  replacement: function(content) {
    return '```\n' + content + '\n```';
  }
});

// node + script + <b|w> + <backup xml> + <output dir> = 5 entries at minimum.
if (process.argv.length < 5) {
  console.log(`Usage: blog2md [b|w] <BACKUP XML> <OUTPUT DIR> m|s`);
  console.log(`\t b for parsing Blogger(Blogspot) backup`);
  console.log(`\t w for parsing WordPress backup`);
  process.exit(1);
}

var option = process.argv[2];
var inputFile = process.argv[3];
var outputDir = process.argv[4];
// Anything other than an explicit "m" falls back to separate comment files.
var mergeComments = (process.argv[5] === 'm') ? 'm' : 's';
/** Apply a fix to WordPress posts to convert newlines to paragraphs. */
var applyParagraphFix = (process.argv.indexOf('paragraph-fix') >= 0);

if (fs.existsSync(outputDir)) {
  console.log(`WARNING: Given output directory "${outputDir}" already exists. Files will be overwritten.`);
} else {
  // recursive: also create missing parent directories (e.g. "out/blog/posts").
  fs.mkdirSync(outputDir, { recursive: true });
}

if (mergeComments === 'm') {
  console.log(`INFO: Comments requested to be merged along with posts. (m)`);
} else {
  console.log(`INFO: Comments requested to be a separate .md file(s - default)`);
}

// The importers are function declarations further down the file, so they are
// hoisted and can be called from here.
if (option.toLowerCase() === 'b') {
  bloggerImport(inputFile, outputDir);
} else if (option.toLowerCase() === 'w') {
  wordpressImport(inputFile, outputDir);
} else {
  console.log('Only b (Blogger) and w (WordPress) are valid options');
  process.exit(1);
}
/**
 * Converts a WordPress export (WXR) into Markdown files with YAML front matter.
 *
 * For every item whose status is neither "private" nor "inherit", a
 * "<outputDir>/<post name>.md" file is written (when the item has content),
 * and its approved comments are handed to writeComments().
 *
 * @param {string} backupXmlFile - Path of the WordPress XML backup to read.
 * @param {string} outputDir - Existing directory that receives the .md files.
 */
function wordpressImport(backupXmlFile, outputDir) {
  // First element of an xml2js node list, or undefined when the node is absent.
  const first = function(nodes) {
    return (Array.isArray(nodes) && nodes.length) ? nodes[0] : undefined;
  };
  // A JSON string literal is also a valid YAML double-quoted scalar, so this
  // escapes embedded quotes, backslashes and newlines for the front matter.
  const yamlQuoted = function(value) {
    return JSON.stringify(String(value == null ? '' : value));
  };
  // YAML flow list of single-quoted scalars; embedded single quotes are doubled.
  const yamlList = function(values) {
    return "['" + values.map(function(value) {
      return String(value).replace(/'/g, "''");
    }).join("', '") + "']";
  };

  const parser = new xml2js.Parser();
  fs.readFile(backupXmlFile, function(readErr, data) {
    if (readErr) {
      console.log(`Error reading xml file (${backupXmlFile})\n${readErr.message}`);
      process.exit(1);
    }
    parser.parseString(data, function(err, result) {
      if (err) {
        console.log(`Error parsing xml file (${backupXmlFile})\n${JSON.stringify(err)}`);
        process.exit(1);
      }

      const channel = result && result.rss && first(result.rss.channel);
      const allItems = (channel && channel.item) || [];
      console.log(`Total Post count: ${allItems.length}`);

      const posts = allItems.filter(function(post) {
        const status = post["wp:status"] ? post["wp:status"].join('') : '';
        return status != "private" && status != "inherit";
      });
      console.log(`Post count: ${posts.length}`);

      posts.forEach(function(post) {
        // SEO fields are reset for every post so that a post without these
        // meta keys does not inherit the values of the previous post.
        let seoDescription = '';
        let seoImage = '';
        (post["wp:postmeta"] || []).forEach(function(meta) {
          const metaKey = first(meta["wp:meta_key"]);
          const metaValue = first(meta["wp:meta_value"]);
          if (metaKey === "_yoast_wpseo_metadesc") {
            seoDescription = metaValue || '';
          } else if (metaKey === "_seopressor_og_image") {
            seoImage = metaValue ? extractRelativePermalink(metaValue) : '';
          }
        });

        const title = String(first(post.title) || '').trim();
        const link = extractRelativePermalink(first(post.link) || '');

        let postType = '';
        if (post["wp:post_type"]) {
          postType = post["wp:post_type"][0];
          console.log(`Post type: ${postType}`);
        }

        const draft = first(post["wp:status"]) === "draft";
        const published = first(post.pubDate) || '';

        // The post name is URL-encoded in the export; a malformed escape
        // sequence must not abort the whole import.
        const rawName = first(post["wp:post_name"]) || '';
        let decodedName;
        try {
          decodedName = decodeURI(rawName);
        } catch (decodeErr) {
          decodedName = rawName;
        }
        // Posts without a usable name fall back to their numeric id.
        const baseName = sanitize(decodedName) || String(first(post["wp:post_id"]) || '');

        const comments = post["wp:comment"] || [];

        console.log(`\n\n\n\ntitle: '${title}'`);
        console.log(`published: '${published}'`);
        console.log(`description: '${seoDescription}'`);
        console.log(`permalink: '${link}'`);
        if (comments.length) {
          console.log(`comments: '${comments.length}'`);
        }

        // <category domain="post_tag"> entries are tags, domain="category" are categories.
        const tags = [];
        const categories = [];
        (post.category || []).forEach(function(category) {
          const domain = category && category.$ && category.$.domain;
          const label = category && category['_'];
          if (label == null) {
            return;
          }
          if (domain === 'post_tag') {
            tags.push(label);
          } else if (domain === 'category') {
            categories.push(label);
          }
        });
        const tagString = tags.length ? `tags: ${yamlList(tags)}\n` : '';
        const categoryString = categories.length ? `categories: ${yamlList(categories)}\n` : '';
        console.log(tagString);
        console.log(categoryString);

        const fname = outputDir + '/' + baseName + '.md';
        console.log(`fname: '${fname}'`);

        const fileHeader = `---\ntitle: ${yamlQuoted(title)}\ndescription: ${yamlQuoted(seoDescription)}\ndate: ${published}\ndraft: ${draft}\npost type: ${postType}\nurl: ${link}\nimage: ${seoImage}\n${tagString}${categoryString}---\n`;

        // Shape expected by writeComments(): fname = comments file,
        // postName = post file, header = front matter for the comments file.
        const pmap = {
          fname: outputDir + '/' + baseName + '-comments.md',
          postName: fname,
          header: `${fileHeader}\n`,
          comments: []
        };

        if (post["content:encoded"]) {
          let postContent = post["content:encoded"].join('');
          if (applyParagraphFix && !/<p>/i.test(postContent)) {
            postContent = '<p>' + postContent.replace(/(\r?\n){2}/g, '</p>\n\n<p>') + '</p>';
          }
          // Wrapped in a <div> so turndown also accepts plain-text content.
          const markdown = tds.turndown('<div>' + postContent + '</div>');
          writeToFile(fname, `${fileHeader}\n${markdown}`);
        }

        comments.forEach(function(comment) {
          // wp:comment_approved is a string: "1" means approved. Other values
          // ("0", "spam", "trash", ...) are non-empty strings too, so a plain
          // truthiness check would let them through.
          if (first(comment["wp:comment_approved"]) !== '1') {
            return;
          }
          const rawContent = first(comment["wp:comment_content"]);
          pmap.comments.push({
            title: '',
            published: first(comment["wp:comment_date"]) || '',
            content: rawContent ? tds.turndown('<div>' + rawContent + '</div>') : '',
            author: {
              name: first(comment["wp:comment_author"]) || '',
              email: first(comment["wp:comment_author_email"]) || '',
              url: first(comment["wp:comment_author_url"]) || ''
            }
          });
        });

        // Re-use the Blogger comment writer, which expects a map of post id -> post map.
        if (pmap.comments.length) {
          writeComments({ "0": pmap });
        }
      });
    });
  });
}
/**
 * Parses an XML fragment and returns the value of its root "wp:meta_value"
 * element.
 *
 * NOTE(review): the result is captured inside the parseString callback, so this
 * relies on xml2js invoking the callback before parseString returns (its
 * default, non-async mode) — confirm if the parser options ever change.
 *
 * @param {string} content - XML text to parse.
 * @returns {*} The parsed "wp:meta_value" value, or '' when the fragment cannot
 *   be parsed or has no such element.
 */
function getDescriptionFromEncodedContent(content) {
  // explicitArray: false -> single child nodes are plain values, not arrays.
  var parser = new xml2js.Parser({ explicitArray: false });
  var description = '';
  parser.parseString(content, function(err, result) {
    // Parse errors are deliberately treated as "no description".
    if (!err && result && result["wp:meta_value"]) {
      description = result["wp:meta_value"];
    }
  });
  return description;
}
/**
 * Finds the first item of type "attachment" with the given post id.
 *
 * Items are expected in xml2js shape, i.e. every field is an array whose first
 * element holds the value (item["wp:post_id"][0]).
 *
 * @param {Array<Object>} items - Parsed items to search.
 * @param {string|number} id - Post id to look for; compared as a string, so a
 *   numeric id matches the string ids of the parsed XML.
 * @returns {Object|null} The matching item, or null when there is none (or
 *   when items is not an array).
 */
function findAttachmentById(items, id) {
  if (!Array.isArray(items)) {
    return null;
  }
  var wantedId = String(id);
  for (var i = 0; i < items.length; i++) {
    var item = items[i];
    if (!item) {
      continue;
    }
    var postType = item["wp:post_type"];
    var postId = item["wp:post_id"];
    if (postType && postType[0] === "attachment" &&
        postId && String(postId[0]) === wantedId) {
      return item;
    }
  }
  return null;
}
/**
 * Returns the path component of an absolute image URL.
 *
 * @param {string} imageUrl - Image URL, e.g. "https://host/wp-content/a.png".
 * @returns {string} The URL's pathname; when imageUrl is not a valid absolute
 *   URL (for instance it is already a relative path) it is returned unchanged,
 *   matching extractRelativePermalink.
 */
function extractRelativeImagePath(imageUrl) {
  try {
    return new URL(imageUrl).pathname;
  } catch (error) {
    // new URL() throws on relative or malformed input; keep the value as-is.
    return imageUrl;
  }
}
/**
 * Reduces an absolute permalink to its path component.
 *
 * @param {string} permalink - Permalink as found in the backup.
 * @returns {string} The pathname of the URL, or the input itself when it
 *   cannot be parsed as an absolute URL.
 */
function extractRelativePermalink(permalink) {
  var relativePath;
  try {
    relativePath = new URL(permalink).pathname;
  } catch (parseFailure) {
    // Not an absolute URL: hand the value back untouched.
    relativePath = permalink;
  }
  return relativePath;
}
/**
 * Builds a lower-case, hyphen-separated file name stem from free text.
 *
 * @param {string} text - Source text, typically a post title. null/undefined
 *   are treated as an empty string.
 * @returns {string} The slug; may be empty when text has no usable characters.
 */
function getFileName(text) {
  var source = (text == null) ? '' : String(text);
  var newFileName = sanitize(source) // first remove any dodgy characters
    .replace(/[.']/g, '') // then remove some known characters
    .replace(/[^a-z0-9]/gi, '-') // then turn anything that isn't a number or letter into a hyphen
    .replace(/[-]{2,}/g, '-') // then turn multiple hyphens into a single one
    .toLowerCase(); // finally make it all lower case
  return newFileName;
}
/**
 * Converts a Blogger (Blogspot) backup feed into Markdown files with YAML
 * front matter.
 *
 * Feed entries whose id contains ".post-" are posts, unless they carry a
 * "thr:in-reply-to" element, in which case they are comments on a post. Every
 * post is written to "<outputDir>/<slug of title>.md"; comments are collected
 * per post and handed to writeComments().
 *
 * @param {string} backupXmlFile - Path of the Blogger XML backup to read.
 * @param {string} outputDir - Existing directory that receives the .md files.
 */
function bloggerImport(backupXmlFile, outputDir){
  // A JSON string literal is also a valid YAML double-quoted scalar, so this
  // escapes embedded quotes, backslashes and newlines for the front matter.
  const yamlQuoted = function(value) {
    return JSON.stringify(String(value == null ? '' : value));
  };

  const parser = new xml2js.Parser();
  fs.readFile(backupXmlFile, function(readErr, data) {
    if (readErr) {
      console.log(`Error reading xml file (${backupXmlFile})\n${readErr.message}`);
      process.exitCode = 1;
      return;
    }
    parser.parseString(data, function (err, result) {
      if (err){
        console.log(`Error parsing xml file (${backupXmlFile})\n${JSON.stringify(err)}`);
        process.exitCode = 1;
        return;
      }

      if (result && result.feed && result.feed.entry) {
        const contents = result.feed.entry;
        console.log(`Total no. of entries found : ${contents.length}`);

        const posts = contents.filter(function(entry){
          return entry.id[0].indexOf('.post-')!=-1 && !entry['thr:in-reply-to'];
        });
        const comments = contents.filter(function(entry){
          return entry.id[0].indexOf('.post-')!=-1 && entry['thr:in-reply-to'];
        });
        console.log(`Content-posts ${posts.length}`);
        console.log(`Content-Comments ${comments.length}`);

        // post id -> { pid, postName, fname, header, comments }
        const postMaps = {};

        posts.forEach(function(entry){
          const postMap = {};
          // Untitled posts have no text node; treat them as an empty title.
          const title = (entry.title && entry.title[0] && entry.title[0]['_']) || '';
          postMap.pid = entry.id[0].split('-').pop();

          const published = entry.published;
          let draft = 'false';
          const control = entry['app:control'] && entry['app:control'][0];
          if (control && control['app:draft'] && control['app:draft'][0] == 'yes'){
            draft = 'true';
          }
          console.log(`title: "${title}"`);
          console.log(`date: ${published}`);
          console.log(`draft: ${draft}`);

          // Fall back to the post id when the title yields no usable slug.
          const sanitizedTitle = (title ? getFileName(title) : '') || postMap.pid;

          const urlLink = (entry.link || []).filter(function(link){
            return link["$"].type && link["$"].rel && link["$"].rel=='alternate' && link["$"].type=='text/html';
          });
          let url = '';
          if (urlLink[0] && urlLink[0]['$'] && urlLink[0]['$'].href){
            url = urlLink[0]['$'].href;
          }

          const baseName = path.basename(sanitizedTitle);
          const fname = outputDir + '/' + baseName + '.md';
          console.log(fname);
          postMap.postName = fname;
          postMap.fname = outputDir + '/' + baseName + '-comments.md';
          postMap.comments = [];

          // Declared per post so a post without content never re-uses the
          // body of the previous post.
          let markdown = '';
          if (entry.content && entry.content[0] && entry.content[0]['_']){
            markdown = tds.turndown(entry.content[0]['_']);
          }

          // Blogger stores its own schema markers as categories too; only
          // the remaining terms are user labels.
          const categories = entry.category || [];
          const tags = categories.filter(function (tag){
            return tag['$'].term && tag['$'].term.indexOf('http://schemas.google')==-1;
          }).map(function(tag){
            return tag['$'].term;
          });
          console.log(`No of category: ${categories.length}`);
          console.log(`tags: \n${tags.map(a=> '- '+a).join('\n')}\n`);

          let tagString = '';
          if (tags.length){
            tagString = `tags: \n${tags.map(a=> '- '+a).join('\n')}\n`;
          }

          console.dir(postMap);
          console.log("\n\n\n\n\n");

          // Strip scheme and host so the front matter holds a site-relative url.
          const link = url.replace(/^.*\/\/[^\/]+/, '');

          // Only fields that exist in a Blogger feed are emitted here.
          const fileHeader = `---\ntitle: ${yamlQuoted(title)}\ndate: ${published}\ndraft: ${draft}\nurl: ${link}\n${tagString}---\n`;
          const fileContent = `${fileHeader}\n${markdown}`;

          postMap.header = fileHeader;
          postMaps[postMap.pid] = postMap;
          writeToFile(fname, fileContent);
        });

        comments.forEach(function(entry){
          const comment = {published:'', title:'', content:''};
          // The "source" attribute ends in the id of the post being replied to.
          const postId = path.basename(entry['thr:in-reply-to'][0]["$"]["source"]);

          comment.published = entry['published'][0];
          if (entry['title'] && entry['title'][0] && entry['title'][0]["_"]){
            comment.title = tds.turndown(entry['title'][0]["_"]);
          }
          if (entry['content'] && entry['content'][0] && entry['content'][0]["_"]){
            comment.content = tds.turndown(entry['content'][0]["_"]);
          }

          comment.author = {name: '', email: '', url: ''};
          const author = (entry['author'] && entry['author'][0]) || {};
          if (author["name"] && author["name"][0]){
            comment.author.name = author["name"][0];
          }
          if (author["email"] && author["email"][0]){
            comment.author.email = author["email"][0];
          }
          if (author["uri"] && author["uri"][0]){
            comment.author.url = author["uri"][0];
          }

          if (!postMaps[postId]) {
            // Comment on a post that is not part of this backup; skip it
            // instead of aborting the import.
            console.log(`WARNING: comment refers to unknown post id ${postId}, skipped`);
            return;
          }
          postMaps[postId].comments.push(comment);
        });

        writeComments(postMaps);
      }
      console.log('Done');
    });
  });
}
/**
 * Writes the collected comments of every post to disk.
 *
 * Depending on the module-level mergeComments flag, the comments are either
 * appended to the post file ('m') or written, preceded by the post header, to
 * the post's separate comments file (any other value).
 *
 * @param {Object} postMaps - Map of post id to
 *   { postName, fname, header, comments }, where each comment is
 *   { title, published, content, author: { name, email, url } }.
 */
function writeComments(postMaps){
  if (mergeComments == 'm'){
    console.log('DEBUG: merge comments requested');
  }else{
    console.log('DEBUG: separate comments requested (defaulted)');
  }
  Object.keys(postMaps).forEach(function(pid){
    var postMap = postMaps[pid];
    var comments = postMap.comments;
    console.log(`post id: ${pid} has ${comments.length} comments`);
    if (!comments.length){
      return;
    }
    var ccontent = '';
    comments.forEach(function(comment){
      // "D" is the day of the month; the lower-case "d" token would print
      // the day of the week (0-6) instead.
      var readableDate = '<time datetime="'+comment.published+'">' + moment(comment.published).format("MMM D, YYYY") + '</time>';
      ccontent += `#### ${comment.title}\n[${comment.author.name}](${comment.author.url} "${comment.author.email}") - ${readableDate}\n\n${comment.content}\n<hr />\n`;
    });
    if (mergeComments == 'm'){
      writeToFile(postMap.postName, `\n---\n### Comments:\n${ccontent}`, true);
    }else{
      writeToFile(postMap.fname, `${postMap.header}\n${ccontent}`);
    }
  });
}
/**
 * Synchronously writes or appends text to a file, logging the outcome.
 * Failures are logged and never thrown to the caller.
 *
 * @param {string} filename - Target file path.
 * @param {string} content - Text to store.
 * @param {boolean} [append=false] - Append to the file instead of replacing it.
 */
function writeToFile(filename, content, append=false){
  // Both modes share one flow; only the wording and the fs call differ.
  const mode = append
    ? {
        announce: `DEBUG: going to append to ${filename}`,
        store: () => fs.appendFileSync(filename, content),
        success: `Successfully appended to ${filename}`,
        failure: `Error while appending to ${filename}`,
      }
    : {
        announce: `DEBUG: going to write to ${filename}`,
        store: () => fs.writeFileSync(filename, content),
        success: `Successfully written to ${filename}`,
        failure: `Error while writing to ${filename}`,
      };

  console.log(mode.announce);
  try{
    mode.store();
    console.log(mode.success);
  }
  catch(problem){
    console.log(`${mode.failure} - ${JSON.stringify(problem)}`);
    console.dir(problem);
  }
}