WordPress to Hugo export of tags and categories for Yoast or Rank Math, with instructions on how to do it with ChatGPT

'use strict';

/***
    Usage: blog2md b|w <BLOGGER/WordPress BACKUP XML> <OUTPUT DIR> [m|s] [paragraph-fix]

    Runs under Node.js. The third-party modules required below (xml2js,
    sanitize-filename, turndown, moment) must be installed first.
*/

const fs = require('fs');
const os = require('os');
const path = require('path');
const xml2js = require('xml2js');
const sanitize = require('sanitize-filename');
const TurndownService = require('turndown');
const moment = require('moment');

// Shared HTML -> Markdown converter used for post bodies and comments.
const tds = new TurndownService({ codeBlockStyle: 'fenced', fence: '```' });

// Emit every <pre> element as a fenced code block.
// NOTE(review): the fence markers inside the two string literals were missing
// from the pasted source (only the newlines survived); they are restored here
// to match the `fence` option above — confirm against the upstream script.
tds.addRule('wppreblock', {
    filter: ['pre'],
    replacement: function (content) {
        return '```\n' + content + '\n```';
    }
});

// ---------------------------------------------------------------------------
// Command-line entry point: validate the arguments, prepare the output
// directory, then dispatch to the Blogger or WordPress importer.
// ---------------------------------------------------------------------------

// argv[0] is node and argv[1] is this script, so three user arguments
// (mode, backup file, output directory) are required; the rest are optional.
if (process.argv.length < 5) {
    console.log('Usage: blog2md [b|w] <BACKUP XML> <OUTPUT DIR> m|s');
    console.log('\t b for parsing Blogger(Blogspot) backup');
    console.log('\t w for parsing WordPress backup');
    process.exit(1);
}

const option = process.argv[2].toLowerCase();
const inputFile = process.argv[3];
const outputDir = process.argv[4];

/** 'm' merges comments into the post file; anything else means a separate file ('s'). */
const mergeComments = (process.argv[5] === 'm') ? 'm' : 's';
/** Apply a fix to WordPress posts to convert newlines to paragraphs. */
const applyParagraphFix = (process.argv.indexOf('paragraph-fix') >= 0);

if (fs.existsSync(outputDir)) {
    console.log(`WARNING: Given output directory "${outputDir}" already exists. Files will be overwritten.`);
} else {
    // recursive so that a nested output path does not fail on a missing parent
    fs.mkdirSync(outputDir, { recursive: true });
}

if (mergeComments === 'm') {
    console.log('INFO: Comments requested to be merged along with posts. (m)');
} else {
    // The default mode is 's'; the message previously named 'm' by mistake.
    console.log('INFO: Comments requested to be a separate .md file(s - default)');
}

if (option === 'b') {
    bloggerImport(inputFile, outputDir);
} else if (option === 'w') {
    wordpressImport(inputFile, outputDir);
} else {
    console.log('Only b (Blogger) and w (WordPress) are valid options');
    process.exit(1);
}

/**
 * Returns the first value stored under `key` on a parsed XML node as a string.
 * Element values are read as arrays; an element that carries attributes is
 * read through its `_` text property. Missing values yield ''.
 */
function wpFirst(node, key) {
    const values = node ? node[key] : undefined;
    if (!Array.isArray(values) || values.length === 0) {
        return '';
    }
    const first = values[0];
    if (first !== null && typeof first === 'object') {
        return first._ === undefined ? '' : String(first._);
    }
    return String(first);
}

/** Escapes a value so it can sit inside a double-quoted YAML scalar on one line. */
function wpYamlDoubleQuoted(value) {
    return String(value)
        .replace(/\\/g, '\\\\')
        .replace(/"/g, '\\"')
        .replace(/\r?\n/g, ' ');
}

/** Builds a `key: ['a', 'b']` front matter line, or '' when there are no values. */
function wpYamlInlineList(key, values) {
    if (!values.length) {
        return '';
    }
    const quoted = values.map(function (value) {
        // inside single-quoted YAML a quote is escaped by doubling it
        return "'" + String(value).replace(/'/g, "''") + "'";
    });
    return key + ': [' + quoted.join(', ') + ']\n';
}

/** Converts one WordPress <item> into a Markdown file (plus its comments). */
function wpExportPost(post, outputDir) {
    // SEO values are collected per post so that a post without these meta
    // keys does not inherit the values of the previously processed post.
    let seoDescription = '';
    let seoImage = '';
    (post['wp:postmeta'] || []).forEach(function (meta) {
        const metaKey = wpFirst(meta, 'wp:meta_key');
        const metaValue = wpFirst(meta, 'wp:meta_value');
        if (metaKey === '_yoast_wpseo_metadesc') {
            seoDescription = metaValue;
        } else if (metaKey === '_seopressor_og_image') {
            seoImage = extractRelativePermalink(metaValue);
        }
    });

    const title = wpFirst(post, 'title').trim();
    const link = extractRelativePermalink(wpFirst(post, 'link'));
    const postType = wpFirst(post, 'wp:post_type');
    if (postType) {
        console.log(`Post type: ${postType}`);
    }
    const draft = wpFirst(post, 'wp:status') === 'draft';
    const published = wpFirst(post, 'pubDate');

    // The file name comes from the post slug, falling back to the post id.
    const rawSlug = wpFirst(post, 'wp:post_name');
    let slug = rawSlug;
    try {
        slug = decodeURI(rawSlug);
    } catch (error) {
        // malformed percent-escapes: keep the raw slug rather than abort the export
        console.log(`WARNING: could not decode post name "${rawSlug}"; using it as-is`);
    }
    const baseName = sanitize(slug) || wpFirst(post, 'wp:post_id');
    const postFile = path.join(outputDir, baseName + '.md');
    const commentsFile = path.join(outputDir, baseName + '-comments.md');

    console.log(`\n\n\n\ntitle: '${title}'`);
    console.log(`published: '${published}'`);
    console.log(`description: '${seoDescription}'`);
    console.log(`permalink: '${link}'`);

    const rawComments = post['wp:comment'] || [];
    if (rawComments.length) {
        console.log(`comments: '${rawComments.length}'`);
    }

    // <category domain="post_tag"> entries are tags, domain="category" are categories.
    const tags = [];
    const categories = [];
    (post.category || []).forEach(function (term) {
        const domain = term && term.$ ? term.$.domain : undefined;
        if (domain === 'post_tag') {
            tags.push(term._);
        } else if (domain === 'category') {
            categories.push(term._);
        }
    });
    const tagString = wpYamlInlineList('tags', tags);
    const categoryString = wpYamlInlineList('categories', categories);
    console.log(tagString);
    console.log(categoryString);
    console.log(`fname: '${postFile}'`);

    // header defaults to '' so writeComments never emits the text "undefined"
    const pmap = { fname: commentsFile, postName: postFile, header: '', comments: [] };

    if (post['content:encoded']) {
        let postContent = wpFirst(post, 'content:encoded');
        if (applyParagraphFix && !/<p>/i.test(postContent)) {
            postContent = '<p>' + postContent.replace(/(\r?\n){2}/g, '</p>\n\n<p>') + '</p>';
        }
        // wrapped in a div to resolve an error if plain text is returned
        const markdown = tds.turndown('<div>' + postContent + '</div>');

        // title/description are double-quoted in the header, so they are
        // escaped for double-quoted YAML (not by doubling single quotes).
        const fileHeader = '---\n' +
            `title: "${wpYamlDoubleQuoted(title)}"\n` +
            `description: "${wpYamlDoubleQuoted(seoDescription)}"\n` +
            `date: ${published}\n` +
            `draft: ${draft}\n` +
            `post type: ${postType}\n` +
            `url: ${link}\n` +
            `image: ${seoImage}\n` +
            tagString +
            categoryString +
            '---\n';

        pmap.header = `${fileHeader}\n`;
        writeToFile(postFile, `${fileHeader}\n${markdown}`);
    }

    rawComments.forEach(function (comment) {
        // wp:comment_approved is a string; "0" is truthy, so compare explicitly
        if (wpFirst(comment, 'wp:comment_approved') !== '1') {
            return;
        }
        const rawContent = wpFirst(comment, 'wp:comment_content');
        pmap.comments.push({
            title: '',
            published: wpFirst(comment, 'wp:comment_date'),
            content: rawContent ? tds.turndown('<div>' + rawContent + '</div>') : '',
            author: {
                name: wpFirst(comment, 'wp:comment_author'),
                email: wpFirst(comment, 'wp:comment_author_email'),
                url: wpFirst(comment, 'wp:comment_author_url')
            }
        });
    });

    // just a hack to re-use the blogger writeComments method
    if (pmap.comments.length) {
        writeComments({ '0': pmap });
    }
}

/**
 * Reads a WordPress export (WXR) file and writes one Markdown file per item
 * into `outputDir`, with Hugo-style front matter (title, SEO description,
 * date, draft flag, post type, url, image, tags and categories).
 * Items whose status is "private" or "inherit" are skipped.
 *
 * @param {string} backupXmlFile path of the WordPress export XML
 * @param {string} outputDir     directory that receives the .md files
 */
function wordpressImport(backupXmlFile, outputDir) {
    const parser = new xml2js.Parser();

    fs.readFile(backupXmlFile, function (readErr, data) {
        if (readErr) {
            console.log(`Error reading xml file (${backupXmlFile})\n${JSON.stringify(readErr)}`);
            process.exit(1);
        }

        parser.parseString(data, function (err, result) {
            if (err) {
                console.log(`Error parsing xml file (${backupXmlFile})\n${JSON.stringify(err)}`);
                process.exit(1);
            }

            const channel = result && result.rss && result.rss.channel && result.rss.channel[0];
            if (!channel) {
                console.log(`Error: no rss/channel element found in (${backupXmlFile})`);
                process.exit(1);
            }

            const allItems = channel.item || [];
            console.log(`Total Post count: ${allItems.length}`);

            const posts = allItems.filter(function (post) {
                const status = post['wp:status'] ? post['wp:status'].join('') : '';
                return status !== 'private' && status !== 'inherit';
            });
            console.log(`Post count: ${posts.length}`);

            posts.forEach(function (post) {
                wpExportPost(post, outputDir);
            });
        });
    });
}

/**
 * Helper function to extract a description from encoded (XML) content.
 * Parses `content` and returns the value found under "wp:meta_value",
 * or '' when it is absent or the content cannot be parsed.
 *
 * NOTE(review): the result is only populated if parseString invokes its
 * callback before this function returns — confirm against the xml2js
 * parser options in use.
 *
 * @param {string} content XML fragment to inspect
 * @returns {string} the extracted description, or ''
 */
function getDescriptionFromEncodedContent(content) {
    const parser = new xml2js.Parser({ explicitArray: false });
    let description = '';

    parser.parseString(content, function (err, result) {
        if (err) {
            console.error('Error parsing encoded content:', err);
            return;
        }
        if (result && result['wp:meta_value']) {
            description = result['wp:meta_value'];
        }
    });

    return description;
}

/**
 * Helper function to find an attachment item by its post id.
 *
 * @param {Array<Object>} items parsed export items; each field is an array
 *                              whose first element holds the value
 * @param {string|number} id    the "wp:post_id" to look for (compared as text,
 *                              so a numeric id matches its string form)
 * @returns {Object|null} the first item whose "wp:post_type" is "attachment"
 *                        and whose "wp:post_id" equals `id`, otherwise null
 */
function findAttachmentById(items, id) {
    const wanted = String(id);
    const match = (items || []).find(function (item) {
        return Boolean(item) &&
            Array.isArray(item['wp:post_type']) && item['wp:post_type'][0] === 'attachment' &&
            Array.isArray(item['wp:post_id']) && String(item['wp:post_id'][0]) === wanted;
    });
    // Array#find yields undefined on a miss; callers expect null
    return match === undefined ? null : match;
}

/**
 * Helper function to extract the relative path of an image URL.
 * Returns only the path component (no host, query string or fragment).
 * Throws if `imageUrl` is not an absolute URL.
 */
function extractRelativeImagePath(imageUrl) {
    const { pathname } = new URL(imageUrl);
    return pathname;
}

/**
 * Helper function to extract a relative permalink.
 * For an absolute URL the path component is returned; any value that cannot
 * be parsed as a URL is handed back unchanged.
 */
function extractRelativePermalink(permalink) {
    let parsed;
    try {
        parsed = new URL(permalink);
    } catch (error) {
        // not a valid absolute URL: keep whatever was passed in
        return permalink;
    }
    return parsed.pathname;
}

/**
 * Turns arbitrary text (a post title) into a lower-case, hyphen-separated
 * file name stem.
 *
 * @param {string} text the text to convert; a missing value is treated as ''
 * @returns {string} the converted name (may be empty)
 */
function getFileName(text) {
    // Coerce first so that a missing title becomes '' instead of reaching
    // sanitize() as undefined.
    const source = typeof text === 'string' ? text : String(text == null ? '' : text);

    return sanitize(source)               // first remove any dodgy characters
        .replace(/[.']/g, '')             // then remove some known characters
        .replace(/[^a-z0-9]/gi, '-')      // then turn anything that isn't a number or letter into a hyphen
        .replace(/-{2,}/g, '-')           // then turn multiple hyphens into a single one
        .toLowerCase();                   // finally make it all lower case
}

/** Escapes a value so it can sit inside a double-quoted YAML scalar on one line. */
function bloggerYamlDoubleQuoted(value) {
    return String(value)
        .replace(/\\/g, '\\\\')
        .replace(/"/g, '\\"')
        .replace(/\r?\n/g, ' ');
}

/** Returns the `_` text of the first element of a parsed node list, or ''. */
function bloggerText(nodes) {
    return (nodes && nodes[0] && nodes[0]._) ? nodes[0]._ : '';
}

/** Returns the first element of a parsed node list when it is truthy, else ''. */
function bloggerFirst(nodes) {
    return (nodes && nodes[0]) ? nodes[0] : '';
}

/**
 * Reads a Blogger backup (Atom feed) and writes one Markdown file per post
 * into `outputDir`, then hands the collected comments to writeComments().
 * Feed entries whose id contains ".post-" are posts, unless they carry a
 * "thr:in-reply-to" element, in which case they are comments.
 *
 * @param {string} backupXmlFile path of the Blogger backup XML
 * @param {string} outputDir     directory that receives the .md files
 */
function bloggerImport(backupXmlFile, outputDir) {
    const parser = new xml2js.Parser();

    fs.readFile(backupXmlFile, function (readErr, data) {
        if (readErr) {
            console.log(`Error reading xml file (${backupXmlFile})\n${JSON.stringify(readErr)}`);
            process.exit(1);
        }

        parser.parseString(data, function (err, result) {
            if (err) {
                console.log(`Error parsing xml file (${backupXmlFile})\n${JSON.stringify(err)}`);
                process.exit(1);
            }

            if (result && result.feed && result.feed.entry) {
                const contents = result.feed.entry;
                console.log(`Total no. of entries found : ${contents.length}`);

                const isPostEntry = function (entry) {
                    return entry.id[0].indexOf('.post-') !== -1;
                };
                const posts = contents.filter(function (entry) {
                    return isPostEntry(entry) && !entry['thr:in-reply-to'];
                });
                const comments = contents.filter(function (entry) {
                    return isPostEntry(entry) && entry['thr:in-reply-to'];
                });

                console.log(`Content-posts ${posts.length}`);
                console.log(`Content-Comments ${comments.length}`);

                // keyed by post id; no prototype, so ids cannot collide with Object members
                const postMaps = Object.create(null);

                posts.forEach(function (entry) {
                    const postMap = { pid: entry.id[0].split('-').pop(), comments: [] };

                    const title = bloggerText(entry.title);
                    const published = bloggerFirst(entry.published);

                    const control = entry['app:control'] && entry['app:control'][0];
                    const draftFlag = control && control['app:draft'] ? control['app:draft'][0] : '';
                    const draft = draftFlag === 'yes' ? 'true' : 'false';

                    console.log(`title: "${title}"`);
                    console.log(`date: ${published}`);
                    console.log(`draft: ${draft}`);

                    // the public URL of the post is its text/html "alternate" link
                    const alternate = (entry.link || []).find(function (link) {
                        return link.$ && link.$.rel === 'alternate' && link.$.type === 'text/html';
                    });
                    const url = (alternate && alternate.$.href) ? alternate.$.href : '';

                    // fall back to the post id when the title yields an empty name
                    const baseName = getFileName(title) || postMap.pid;
                    const fname = path.join(outputDir, baseName + '.md');
                    console.log(fname);
                    postMap.postName = fname;
                    postMap.fname = path.join(outputDir, baseName + '-comments.md');

                    // reset per entry so a post without content does not reuse
                    // the Markdown of the previous post
                    let markdown = '';
                    if (entry.content && entry.content[0] && entry.content[0]._) {
                        markdown = tds.turndown(entry.content[0]._);
                    }

                    const categories = entry.category || [];
                    console.log(`No of category: ${categories.length}`);
                    const tags = categories
                        .map(function (tag) { return tag.$ ? tag.$.term : undefined; })
                        .filter(function (term) {
                            // terms under http://schemas.google are not user labels
                            return term && term.indexOf('http://schemas.google') === -1;
                        });

                    const tagList = `tags: \n${tags.map(a => '- ' + a).join('\n')}\n`;
                    console.log(tagList);
                    const tagString = tags.length ? tagList : '';

                    console.log('\n\n\n\n\n');

                    // strip scheme and host, keeping the site-relative path
                    const link = url.replace(/^.*\/\/[^\/]+/, '');

                    // Only fields available for Blogger entries are emitted; the
                    // pasted header referenced WordPress-only variables that do
                    // not exist in this function.
                    const fileHeader = '---\n' +
                        `title: "${bloggerYamlDoubleQuoted(title)}"\n` +
                        `date: ${published}\n` +
                        `draft: ${draft}\n` +
                        `url: ${link}\n` +
                        tagString +
                        '---\n';
                    const fileContent = `${fileHeader}\n${markdown}`;

                    postMap.header = fileHeader;
                    postMaps[postMap.pid] = postMap;

                    writeToFile(fname, fileContent);
                });

                comments.forEach(function (entry) {
                    // the last path segment of the reply source is used as the post id
                    const postId = path.basename(entry['thr:in-reply-to'][0].$.source);
                    const target = postMaps[postId];
                    if (!target) {
                        console.log(`WARNING: comment refers to unknown post id ${postId}; skipping`);
                        return;
                    }

                    const author = (entry.author && entry.author[0]) || {};
                    const titleHtml = bloggerText(entry.title);
                    const contentHtml = bloggerText(entry.content);

                    target.comments.push({
                        published: bloggerFirst(entry.published),
                        title: titleHtml ? tds.turndown(titleHtml) : '',
                        content: contentHtml ? tds.turndown(contentHtml) : '',
                        author: {
                            name: bloggerFirst(author.name),
                            email: bloggerFirst(author.email),
                            url: bloggerFirst(author.uri)
                        }
                    });
                });

                writeComments(postMaps);
            }
            console.log('Done');
        });
    });
}

/**
 * Writes the comments collected for each post.
 * In merge mode (mergeComments === 'm') the comments are appended to the post
 * file (`postName`); otherwise they go to a separate file (`fname`) that
 * starts with the post header.
 *
 * @param {Object} postMaps map of post id -> { postName, fname, header, comments }
 *                          where each comment has published, title, content
 *                          and author { name, email, url }
 */
function writeComments(postMaps){
    if (mergeComments === 'm'){
        console.log('DEBUG: merge comments requested');
    }else{
        console.log('DEBUG: separate comments requested (defaulted)');
    }

    Object.keys(postMaps).forEach(function(postId){
        const postMap = postMaps[postId];
        const comments = postMap.comments;
        console.log(`post id: ${postId} has ${comments.length} comments`);

        if (!comments.length){
            return;
        }

        let ccontent = '';
        comments.forEach(function(comment){
            // "D" is the day of the month; the previous "d" token is the day of the week
            const readableDate = '<time datetime="'+comment.published+'">' + moment(comment.published).format("MMM D, YYYY") + '</time>';

            ccontent += `#### ${comment.title}\n[${comment.author.name}](${comment.author.url} "${comment.author.email}") - ${readableDate}\n\n${comment.content}\n<hr />\n`;
        });

        if (mergeComments === 'm'){
            writeToFile(postMap.postName, `\n---\n### Comments:\n${ccontent}`, true);
        }else{
            // a post without a header must not produce the literal text "undefined"
            writeToFile(postMap.fname, `${postMap.header || ''}\n${ccontent}`);
        }
    });
}

/**
 * Writes `content` to `filename`, replacing the file unless `append` is true,
 * in which case the content is added to the end of the file.
 * Failures are logged to the console and not rethrown.
 *
 * @param {string}  filename target path
 * @param {string}  content  text to store
 * @param {boolean} [append=false] append instead of overwrite
 */
function writeToFile(filename, content, append=false){
    // the three log messages differ only in the verb form they use
    const intent = append ? 'append' : 'write';
    const completed = append ? 'appended' : 'written';
    const ongoing = append ? 'appending' : 'writing';

    console.log(`DEBUG: going to ${intent} to ${filename}`);
    try {
        if (append) {
            fs.appendFileSync(filename, content);
        } else {
            fs.writeFileSync(filename, content);
        }
        console.log(`Successfully ${completed} to ${filename}`);
    } catch (failure) {
        console.log(`Error while ${ongoing} to ${filename} - ${JSON.stringify(failure)}`);
        console.dir(failure);
    }
}

I will be giving a Hugo talk on how to do this while keeping your SEO standing on Google. Read at your own peril.

Perhaps you could fix the formatting of your code meanwhile.
```
code goes here
```

And provide some more detail as to how all that works. I suppose it's using Node.js, but there's no indication…

1 Like