Last updated
HTML to Plain Text Conversion
Converting HTML to plain text strips all markup and returns only the visible text content. This is useful for generating email plain-text alternatives, extracting content for search indexing, creating text previews, or processing HTML content with text analysis tools. The challenge is preserving the semantic structure — headings, lists, and paragraphs should be represented meaningfully in plain text.
Conversion Strategies
| HTML Element | Plain Text Representation |
|---|---|
| `<h1>`–`<h6>` | Text in uppercase or with === underline |
| `<p>` | Text followed by double newline |
| `<br>` | Single newline |
| `<li>` | • or - prefix |
| `<a href="...">` | Text (URL) or just text |
| `<img alt="...">` | [alt text] or (empty) |
| `<table>` | Tab-separated or ASCII table |
JavaScript Implementation
/**
 * Convert an HTML string to plain text, keeping a rough outline of its
 * structure (headings, paragraphs, line breaks, list items, links, images).
 *
 * Relies on the DOM globals `DOMParser` and `Node` being available.
 *
 * Conversion rules:
 *  - `<h1>`–`<h6>`: uppercased text on its own line
 *  - `<p>`, `<div>`: text surrounded by newlines
 *  - `<br>`: a single newline
 *  - `<li>`: a new line prefixed with "• "
 *  - `<a>`: "text (href)", or just the text when the link has no href
 *  - `<img>`: "[alt]", or nothing when there is no alt text
 *  - `<script>`, `<style>`: dropped entirely
 *  - anything else: the concatenated text of its children
 *
 * @param {string} html - HTML markup to convert.
 * @returns {string} The plain-text rendering, trimmed, with runs of three or
 *   more newlines collapsed to a single blank line.
 */
function htmlToText(html) {
  // Use DOM parser for accurate parsing
  const doc = new DOMParser().parseFromString(html, 'text/html');

  function nodeToText(node) {
    if (node.nodeType === Node.TEXT_NODE) return node.textContent;
    // Comments, processing instructions, etc. contribute no visible text.
    if (node.nodeType !== Node.ELEMENT_NODE) return '';

    const tag = node.tagName.toLowerCase();

    // Decide this before recursing so non-visible subtrees are never walked.
    if (tag === 'script' || tag === 'style') return '';

    const children = Array.from(node.childNodes).map(nodeToText).join('');

    switch (tag) {
      case 'h1':
      case 'h2':
      case 'h3':
      case 'h4':
      case 'h5':
      case 'h6':
        return `\n${children.toUpperCase()}\n`;
      case 'p':
      case 'div':
        return `\n${children}\n`;
      case 'br':
        return '\n';
      case 'li':
        return `\n• ${children}`;
      case 'a':
        // An anchor without an href would otherwise render as "text ()".
        return node.href ? `${children} (${node.href})` : children;
      case 'img':
        return node.alt ? `[${node.alt}]` : '';
      default:
        return children;
    }
  }

  // Adjacent block elements each emit their own surrounding newlines; cap the
  // resulting runs at one blank line.
  return nodeToText(doc.body)
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}