194 lines
5.0 KiB
JavaScript
194 lines
5.0 KiB
JavaScript
var generate = require('regjsgen').generate;
|
|
var parse = require('regjsparser').parse;
|
|
var regenerate = require('regenerate');
|
|
var iuMappings = require('./data/iu-mappings.json');
|
|
var ESCAPE_SETS = require('./data/character-class-escape-sets.js');
|
|
|
|
/**
 * Look up the precomputed set for a character class escape.
 *
 * The lookup table inside `ESCAPE_SETS` is chosen from the module-level flags:
 * `REGULAR` when `unicode` is off, `UNICODE_IGNORE_CASE` when both `unicode`
 * and `ignoreCase` are on, and `UNICODE` otherwise.
 *
 * @param {string} character Key identifying the escape (callers pass the
 *     `value` of a `characterClassEscape` node).
 * @returns {*} The entry stored under `character` in the selected table, or
 *     `undefined` when the table has no such key.
 */
function getCharacterClassEscapeSet(character) {
  var table;
  if (!unicode) {
    table = ESCAPE_SETS.REGULAR;
  } else if (ignoreCase) {
    table = ESCAPE_SETS.UNICODE_IGNORE_CASE;
  } else {
    table = ESCAPE_SETS.UNICODE;
  }
  return table[character];
}
|
|
|
|
// Keep a detached reference to the built-in `hasOwnProperty` so the check
// below does not depend on the inspected object exposing that method itself.
var object = {};
var hasOwnProperty = object.hasOwnProperty;

/**
 * Report whether `property` is an own (non-inherited) property of `object`.
 *
 * @param {Object} object The object to inspect.
 * @param {string|number} property The property key to look for.
 * @returns {boolean} `true` if `object` itself carries `property`.
 */
function has(object, property) {
  var isOwnProperty = hasOwnProperty.call(object, property);
  return isOwnProperty;
}
|
|
|
|
// Regenerate set spanning every code point from 0x0 through 0x10FFFF. Negated
// character classes (if any) are computed against it when the `u` flag is set.
var UNICODE_SET = regenerate();
UNICODE_SET.addRange(0x0, 0x10FFFF);

// Counterpart used when the `u` flag is absent: the range then stops at
// 0xFFFF. https://mths.be/es6#sec-pattern-semantics
var BMP_SET = regenerate();
BMP_SET.addRange(0x0, 0xFFFF);
|
|
|
|
// Code points that `/./u` is supposed to match (https://mths.be/es6#sec-atom):
// a copy of `UNICODE_SET` minus the four `LineTerminator` code points
// (https://mths.be/es6#sec-line-terminators):
//   0x000A  Line Feed (LF)
//   0x000D  Carriage Return (CR)
//   0x2028  Line Separator (LS)
//   0x2029  Paragraph Separator (PS)
var DOT_SET_UNICODE = UNICODE_SET.clone().remove(0x000A, 0x000D, 0x2028, 0x2029);

// Code points that `/./` (no `u` flag) is supposed to match: the set above,
// restricted to the code points that are also in `BMP_SET`.
var DOT_SET = DOT_SET_UNICODE.clone().intersection(BMP_SET);
|
|
|
|
// Extend Regenerate sets with a helper that walks the code points from `min`
// up to `max` and adds, for each one, whatever `caseFold` reports for it (if
// anything). The range itself is NOT added here; callers add it separately.
// The loop body always runs at least once, i.e. `min` is examined even when
// `min > max`. Returns the set so calls can be chained.
regenerate.prototype.iuAddRange = function(min, max) {
  var codePoint = min;
  do {
    var mapped = caseFold(codePoint);
    if (mapped) {
      this.add(mapped);
    }
    codePoint++;
  } while (codePoint <= max);
  return this;
};
|
|
|
|
/**
 * Copy every enumerable property of `source` onto `target`, overwriting any
 * property of the same name. Mutates `target` in place.
 *
 * @param {Object} target Object that receives the properties.
 * @param {Object} source Object whose properties are copied.
 * @returns {undefined}
 */
function assign(target, source) {
  var name;
  for (name in source) {
    // Deliberately unfiltered: `hasOwnProperty` is not needed here.
    target[name] = source[name];
  }
}
|
|
|
|
/**
 * Overwrite the properties of the AST node `item` with those of the tree
 * obtained by parsing `pattern` (parsed with an empty flags string).
 *
 * When the parsed tree's root is a `characterClass`, `group` or `value` node
 * it is copied as is; any other root is first wrapped in a non-capturing
 * group via `wrap`. A falsy `pattern` leaves `item` untouched.
 *
 * @param {Object} item The node to overwrite in place.
 * @param {string} pattern Source text of the replacement pattern.
 * @returns {undefined}
 */
function update(item, pattern) {
  // TODO: Test if memoizing `pattern` here is worth the effort.
  if (pattern) {
    var parsed = parse(pattern, '');
    var kind = parsed.type;
    var needsNoWrapping =
      kind === 'characterClass' || kind === 'group' || kind === 'value';
    assign(item, needsNoWrapping ? parsed : wrap(parsed, pattern));
  }
}
|
|
|
|
/**
 * Build a non-capturing group node (`behavior: 'ignore'`) whose only child is
 * `tree` and whose raw source is `pattern` surrounded by `(?:` and `)`.
 *
 * @param {Object} tree The node to place inside the group.
 * @param {string} pattern Source text corresponding to `tree`.
 * @returns {Object} A new `group` node; `tree` itself is not modified.
 */
function wrap(tree, pattern) {
  var group = {};
  group.type = 'group';
  group.behavior = 'ignore';
  group.body = [tree];
  group.raw = '(?:' + pattern + ')';
  return group;
}
|
|
|
|
/**
 * Look up `codePoint` in the `iuMappings` table.
 *
 * @param {number} codePoint The code point to look up.
 * @returns {*} The value stored for `codePoint` when `iuMappings` has it as an
 *     own property, otherwise `false`.
 */
function caseFold(codePoint) {
  if (!has(iuMappings, codePoint)) {
    return false;
  }
  return iuMappings[codePoint];
}
|
|
|
|
// Flags of the pattern currently being rewritten. This is module-level state:
// the exported function assigns both before the tree is processed, and the
// helpers in this file read them.
var ignoreCase = false;
var unicode = false;

/**
 * Rewrite a `characterClass` node in place.
 *
 * Every member of the class is collected into a single Regenerate set; when
 * both `ignoreCase` and `unicode` are on, the case-folded counterparts
 * reported by `caseFold` are added as well. A negated class is turned into
 * its complement relative to `UNICODE_SET` (with the `u` flag) or `BMP_SET`
 * (without it). The node is then overwritten, via `update`, with the string
 * form of the resulting set.
 *
 * @param {Object} characterClassItem The `characterClass` node; its `body`
 *     array and `negative` flag are read.
 * @returns {Object} The same `characterClassItem`, after being updated.
 * @throws {Error} If a member of `body` has an unrecognized `type`.
 */
function processCharacterClass(characterClassItem) {
  var set = regenerate();
  characterClassItem.body.forEach(function(item) {
    switch (item.type) {
      case 'value':
        set.add(item.codePoint);
        if (ignoreCase && unicode) {
          var folded = caseFold(item.codePoint);
          if (folded) {
            set.add(folded);
          }
        }
        break;
      case 'characterClassRange':
        var min = item.min.codePoint;
        var max = item.max.codePoint;
        set.addRange(min, max);
        if (ignoreCase && unicode) {
          // Adds only the case-folded counterparts; the range itself was
          // added just above.
          set.iuAddRange(min, max);
        }
        break;
      case 'characterClassEscape':
        set.add(getCharacterClassEscapeSet(item.value));
        break;
      // The `default` clause is only here as a safeguard; it should never be
      // reached. Code coverage tools should ignore it.
      /* istanbul ignore next */
      default:
        throw new Error('Unknown term type: ' + item.type);
    }
  });
  if (characterClassItem.negative) {
    // Clone first so the shared base set is never modified.
    set = (unicode ? UNICODE_SET : BMP_SET).clone().remove(set);
  }
  update(characterClassItem, set.toString());
  return characterClassItem;
}
|
|
|
|
/**
 * Rewrite a single AST node (and, recursively, its children) in place.
 *
 * - `dot`, `characterClassEscape` and `value` nodes are overwritten, via
 *   `update`, with the string form of the set they stand for. For `value`
 *   nodes the case-folded counterpart reported by `caseFold` is included when
 *   both `ignoreCase` and `unicode` are on.
 * - `characterClass` nodes are delegated to `processCharacterClass`.
 * - `alternative`, `disjunction`, `group` and `quantifier` nodes have every
 *   entry of their `body` array processed recursively.
 * - `anchor`, `empty` and `reference` nodes are left untouched.
 *
 * @param {Object} item The node to process.
 * @returns {Object} The processed node (the same object that was passed in).
 * @throws {Error} If `item.type` is not one of the types listed above.
 */
function processTerm(item) {
  switch (item.type) {
    case 'dot':
      update(
        item,
        (unicode ? DOT_SET_UNICODE : DOT_SET).toString()
      );
      break;
    case 'characterClass':
      item = processCharacterClass(item);
      break;
    case 'characterClassEscape':
      update(
        item,
        getCharacterClassEscapeSet(item.value).toString()
      );
      break;
    case 'alternative':
    case 'disjunction':
    case 'group':
    case 'quantifier':
      item.body = item.body.map(processTerm);
      break;
    case 'value':
      var codePoint = item.codePoint;
      var set = regenerate(codePoint);
      if (ignoreCase && unicode) {
        var folded = caseFold(codePoint);
        if (folded) {
          set.add(folded);
        }
      }
      update(item, set.toString());
      break;
    case 'anchor':
    case 'empty':
    case 'reference':
      // Nothing to do here.
      break;
    // The `default` clause is only here as a safeguard; it should never be
    // reached. Code coverage tools should ignore it.
    /* istanbul ignore next */
    default:
      throw new Error('Unknown term type: ' + item.type);
  }
  return item;
}
|
|
|
|
module.exports = function(pattern, flags) {
|
|
var tree = parse(pattern, flags);
|
|
ignoreCase = flags ? flags.indexOf('i') > -1 : false;
|
|
unicode = flags ? flags.indexOf('u') > -1 : false;
|
|
assign(tree, processTerm(tree));
|
|
return generate(tree);
|
|
};
|