var generate = require('regjsgen').generate; var parse = require('regjsparser').parse; var regenerate = require('regenerate'); var iuMappings = require('./data/iu-mappings.json'); var ESCAPE_SETS = require('./data/character-class-escape-sets.js'); function getCharacterClassEscapeSet(character) { if (unicode) { if (ignoreCase) { return ESCAPE_SETS.UNICODE_IGNORE_CASE[character]; } return ESCAPE_SETS.UNICODE[character]; } return ESCAPE_SETS.REGULAR[character]; } var object = {}; var hasOwnProperty = object.hasOwnProperty; function has(object, property) { return hasOwnProperty.call(object, property); } // Prepare a Regenerate set containing all code points, used for negative // character classes (if any). var UNICODE_SET = regenerate().addRange(0x0, 0x10FFFF); // Without the `u` flag, the range stops at 0xFFFF. // https://mths.be/es6#sec-pattern-semantics var BMP_SET = regenerate().addRange(0x0, 0xFFFF); // Prepare a Regenerate set containing all code points that are supposed to be // matched by `/./u`. https://mths.be/es6#sec-atom var DOT_SET_UNICODE = UNICODE_SET.clone() // all Unicode code points .remove( // minus `LineTerminator`s (https://mths.be/es6#sec-line-terminators): 0x000A, // Line Feed 0x000D, // Carriage Return 0x2028, // Line Separator 0x2029 // Paragraph Separator ); // Prepare a Regenerate set containing all code points that are supposed to be // matched by `/./` (only BMP code points). var DOT_SET = DOT_SET_UNICODE.clone() .intersection(BMP_SET); // Add a range of code points + any case-folded code points in that range to a // set. regenerate.prototype.iuAddRange = function(min, max) { var $this = this; do { var folded = caseFold(min); if (folded) { $this.add(folded); } } while (++min <= max); return $this; }; function assign(target, source) { for (var key in source) { // Note: `hasOwnProperty` is not needed here. target[key] = source[key]; } } function update(item, pattern) { // TODO: Test if memoizing `pattern` here is worth the effort. if (!pattern) { return; } var tree = parse(pattern, ''); switch (tree.type) { case 'characterClass': case 'group': case 'value': // No wrapping needed. break; default: // Wrap the pattern in a non-capturing group. tree = wrap(tree, pattern); } assign(item, tree); } function wrap(tree, pattern) { // Wrap the pattern in a non-capturing group. return { 'type': 'group', 'behavior': 'ignore', 'body': [tree], 'raw': '(?:' + pattern + ')' }; } function caseFold(codePoint) { return has(iuMappings, codePoint) ? iuMappings[codePoint] : false; } var ignoreCase = false; var unicode = false; function processCharacterClass(characterClassItem) { var set = regenerate(); var body = characterClassItem.body.forEach(function(item) { switch (item.type) { case 'value': set.add(item.codePoint); if (ignoreCase && unicode) { var folded = caseFold(item.codePoint); if (folded) { set.add(folded); } } break; case 'characterClassRange': var min = item.min.codePoint; var max = item.max.codePoint; set.addRange(min, max); if (ignoreCase && unicode) { set.iuAddRange(min, max); } break; case 'characterClassEscape': set.add(getCharacterClassEscapeSet(item.value)); break; // The `default` clause is only here as a safeguard; it should never be // reached. Code coverage tools should ignore it. /* istanbul ignore next */ default: throw Error('Unknown term type: ' + item.type); } }); if (characterClassItem.negative) { set = (unicode ? UNICODE_SET : BMP_SET).clone().remove(set); } update(characterClassItem, set.toString()); return characterClassItem; } function processTerm(item) { switch (item.type) { case 'dot': update( item, (unicode ? DOT_SET_UNICODE : DOT_SET).toString() ); break; case 'characterClass': item = processCharacterClass(item); break; case 'characterClassEscape': update( item, getCharacterClassEscapeSet(item.value).toString() ); break; case 'alternative': case 'disjunction': case 'group': case 'quantifier': item.body = item.body.map(processTerm); break; case 'value': var codePoint = item.codePoint; var set = regenerate(codePoint); if (ignoreCase && unicode) { var folded = caseFold(codePoint); if (folded) { set.add(folded); } } update(item, set.toString()); break; case 'anchor': case 'empty': case 'group': case 'reference': // Nothing to do here. break; // The `default` clause is only here as a safeguard; it should never be // reached. Code coverage tools should ignore it. /* istanbul ignore next */ default: throw Error('Unknown term type: ' + item.type); } return item; }; module.exports = function(pattern, flags) { var tree = parse(pattern, flags); ignoreCase = flags ? flags.indexOf('i') > -1 : false; unicode = flags ? flags.indexOf('u') > -1 : false; assign(tree, processTerm(tree)); return generate(tree); };