html_tree_converter.service.js (3116B)
- import { getTagName } from './utility.service.js'
- import { unescape } from 'lodash'
/**
 * This is a not-so-tiny purpose-built HTML parser/processor. It parses HTML
 * and converts it into a tree structure representing tag openers/closers and
 * children.
 *
 * Structure follows this pattern: [opener, [...children], closer], except for
 * the root node, which is just [...children]. Self-closing (void) tags are
 * represented as single-item arrays: [tag]. Text nodes can only appear within
 * children and are represented as strings.
 *
 * Additional behavior worth knowing:
 * - Openers are passed through `unescape`; self-closing tags, closers and
 *   text are stored exactly as they appear in the input.
 * - A closer that does not match the innermost open tag is kept as a plain
 *   string among the current children.
 * - A tag that is never closed yields [opener, [...children]] with no closer.
 * - An unterminated tag at the end of input (e.g. "<div") is emitted as text.
 * - Tag names are compared case-insensitively (`<BR>` is a void element and
 *   `<DIV>` is closed by `</div>`).
 *
 * Intended use is to convert HTML structure and then recursively iterate over
 * it, most likely using a map. Very useful for dynamically rendering HTML,
 * replacing tags with JSX elements in a render function.
 *
 * known issue: doesn't handle CDATA, so CDATA might not work well
 * known issue: doesn't handle HTML comments
 * known issue: a '>' inside an attribute value terminates the tag early
 *
 * @param {string} [html=''] - HTML source to parse
 * @return {Array} root list of children (strings and nested tag arrays)
 */
export const convertHtmlToTree = (html = '') => {
  // Elements that are implicitly self-closing
  // https://developer.mozilla.org/en-US/docs/Glossary/empty_element
  const emptyElements = new Set([
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
  ])
  // TODO For future - also parse HTML5 multi-source components?

  const buffer = [] // Root output buffer (the returned tree)
  // Stack of currently open levels as [opener, children]; the first entry is
  // the root, which has an empty opener and shares `buffer` as its children.
  const levels = [['', buffer]]
  let textBuffer = '' // Text collected since the last tag
  let tagBuffer = null // Tag being read; null = not currently inside a tag

  const getCurrentBuffer = () => levels[levels.length - 1][1]

  // HTML tag names are case-insensitive, so compare them lowercased.
  // NOTE(review): getTagName is defined in utility.service.js; this assumes it
  // returns the bare tag name as a string, or a non-string (e.g. null) when no
  // name can be extracted - non-string results are passed through untouched.
  const getNormalizedTagName = (tag) => {
    const name = getTagName(tag)
    return typeof name === 'string' ? name.toLowerCase() : name
  }

  // Moves pending text (if any) into the current children list
  const flushText = () => {
    if (textBuffer === '') return
    getCurrentBuffer().push(textBuffer)
    textBuffer = ''
  }

  const handleSelfClosing = (tag) => {
    getCurrentBuffer().push([tag])
  }

  const handleOpen = (tag) => {
    const curBuf = getCurrentBuffer()
    const newLevel = [unescape(tag), []]
    levels.push(newLevel)
    curBuf.push(newLevel)
  }

  const handleClose = (tag) => {
    const currentLevel = levels[levels.length - 1]
    // The root level must never be closed: its opener is '' and if the name
    // lookup yields the same value for '' and for a malformed closer (such as
    // "</>"), popping it would leave `levels` empty and make every later
    // buffer access throw.
    const hasOpenTag = levels.length > 1
    if (hasOpenTag && getNormalizedTagName(currentLevel[0]) === getNormalizedTagName(tag)) {
      currentLevel.push(tag)
      levels.pop()
    } else {
      // Stray/mismatched closer: keep it verbatim as text-like content
      currentLevel[1].push(tag)
    }
  }

  for (const char of html) {
    if (tagBuffer === null) {
      if (char === '<') {
        // Start of a tag: whatever text came before it is complete
        flushText()
        tagBuffer = char
      } else {
        textBuffer += char
      }
      continue
    }

    tagBuffer += char
    if (char !== '>') continue

    // End of a tag: classify it as closer, self-closing or opener
    const tagFull = tagBuffer
    tagBuffer = null
    if (tagFull[1] === '/') {
      handleClose(tagFull)
    } else if (emptyElements.has(getNormalizedTagName(tagFull)) || tagFull[tagFull.length - 2] === '/') {
      // self-closing
      handleSelfClosing(tagFull)
    } else {
      handleOpen(tagFull)
    }
  }

  // Input ended in the middle of a tag: treat the partial tag as plain text
  if (tagBuffer !== null) {
    textBuffer += tagBuffer
  }
  flushText()
  return buffer
}