strip_html.js 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  'use strict';

  // States of the character-level scanner used by striptags().
  const STATE_PLAINTEXT = Symbol('plaintext'); // outside any tag; characters are copied to the output
  const STATE_HTML = Symbol('html'); // inside '<...>'; characters are buffered and dropped
  const STATE_COMMENT = Symbol('comment'); // inside '<!-- ... -->'; dropped until '-->'

  /**
   * Removes HTML tags and comments from a string, keeping only the plain text.
   *
   * The input is scanned one character at a time with a small state machine:
   * - '<' and '>' inside a quoted attribute value do not open or close a tag.
   * - Nested '<' inside a tag (e.g. '<<b>>') are balanced against '>' before
   *   the tag is considered closed.
   * - A '<' immediately followed by a space or newline is treated as literal
   *   text (e.g. 'a < b') and is emitted together with that whitespace.
   * - Input that ends inside an unterminated tag or comment is dropped, since
   *   buffered tag text is never copied to the output.
   *
   * @param {string|String} [html=''] - Markup to strip.
   * @returns {string} The text with tags and comments removed, or '' when
   *   `html` is not a string.
   */
  function striptags(html = '') {
    // Anything that is not a string (primitive or String object) yields ''.
    if (typeof html !== 'string' && !(html instanceof String)) {
      return '';
    }

    let state = STATE_PLAINTEXT;
    let tagBuffer = ''; // text of the tag/comment currently being skipped
    let depth = 0; // number of unmatched nested '<' inside the current tag
    let inQuoteChar = ''; // the quote character we are inside of, or ''
    let output = '';

    const { length } = html;
    for (let idx = 0; idx < length; idx++) {
      const char = html[idx];

      if (state === STATE_PLAINTEXT) {
        if (char === '<') {
          state = STATE_HTML;
          tagBuffer += char;
        } else {
          output += char;
        }
      } else if (state === STATE_HTML) {
        switch (char) {
          case '<':
            // A '<' inside a quoted value is ignored; otherwise it is nested.
            if (!inQuoteChar) {
              depth++;
            }
            break;

          case '>':
            // A '>' inside a quoted value does not close anything.
            if (inQuoteChar) {
              break;
            }
            // Something like '<<>>': this '>' only closes a nested '<'.
            if (depth) {
              depth--;
              break;
            }
            // This '>' closes the tag held in tagBuffer; discard it.
            inQuoteChar = '';
            state = STATE_PLAINTEXT;
            tagBuffer = '';
            break;

          case '"':
          case '\'':
            // The matching quote closes the value; the other quote character
            // is ignored while a quoted value is open.
            if (char === inQuoteChar) {
              inQuoteChar = '';
            } else {
              inQuoteChar = inQuoteChar || char;
            }
            tagBuffer += char;
            break;

          case '-':
            if (tagBuffer === '<!-') {
              // '<!--' starts a comment. Forget any nested '<' counted so
              // far so it cannot affect the tag that follows the comment.
              state = STATE_COMMENT;
              depth = 0;
            }
            tagBuffer += char;
            break;

          case ' ':
          case '\n':
            if (tagBuffer === '<') {
              // A '<' followed by whitespace is literal text, not a tag.
              // Emit the whitespace character actually seen so a newline is
              // not turned into a space, and reset depth so a stray nested
              // '<' does not leak into the next tag.
              state = STATE_PLAINTEXT;
              output += `<${char}`;
              tagBuffer = '';
              depth = 0;
              break;
            }
            tagBuffer += char;
            break;

          default:
            tagBuffer += char;
            break;
        }
      } else if (state === STATE_COMMENT) {
        if (char === '>') {
          // Only '-->' ends the comment; any other '>' is comment content.
          if (tagBuffer.slice(-2) === '--') {
            state = STATE_PLAINTEXT;
          }
          tagBuffer = '';
        } else {
          tagBuffer += char;
        }
      }
    }

    return output;
  }
  // CommonJS export: the module's sole export is the striptags function.
  module.exports = striptags;