* * For the full copyright and license information refer to the LICENSE file * distributed with this source code */ namespace Peast\Selector; /** * Selector parser class * * @author Marco MarchiĆ² */ class Parser { /** * Selector string * * @var string */ protected $selector; /** * Current parser index * * @var int */ protected $index = 0; /** * Selector length * * @var int */ protected $length; /** * Whitespaces * * @var array */ protected $whitespaces = array(" ", "\t", "\n", "\r", "\f"); /** * Combinators * * @var array */ protected $combinators = array(">", "+", "~"); /** * Attribute selector operator characters * * @var array */ protected $attrOperatorChars = array("=", "<", ">", "^", "$", "*"); /** * Attribute selector operators * * @var array */ protected $attrOperators = array("=", "<", ">", "<=", ">=", "^=", "$=", "*="); /** * Valid pseudo selectors. The value indicates the argument type: * - 0: no arguments * - 1: index formula (An+B syntax) * - 2: selector * @var array */ protected $validPseudo = array( "pattern" => 0, "statement" => 0, "expression" => 0, "declaration" => 0, "first-child" => 0, "last-child" => 0, "nth-child" => 1, "nth-last-child" => 1, "has" => 2, "is" => 2, "not" => 2 ); /** * Class constructor * * @param string $selector Selector string * @param array $options Options array. See Query class * documentation for available options */ public function __construct($selector, $options = array()) { $encoding = isset($options["encoding"]) ? $options["encoding"] : null; if ($encoding && !preg_match("/UTF-?8/i", $encoding)) { $selector = mb_convert_encoding($selector, "UTF-8", $encoding); } $this->selector = $selector; $this->length = strlen($selector); } /** * Starts the parsing and returns the parsed selector * * @param bool $filter True if the selector must be used for a filter * * @return Node\Selector * * @throws Exception */ public function parse($filter = false) { $selector = $this->parseSelector($filter); //Throw an exception if the end has not been reached if (($char = $this->getChar()) !== null) { throw new Exception("Invalid syntax '$char'"); } return $selector; } /** * Parses a selector * * @param bool $filter True if the selector must be used for a filter * * @return Node\Selector * * @throws Exception */ public function parseSelector($filter = false) { $selector = new Node\Selector; do { $first = true; $group = new Node\Group; while (true) { $combinator = $this->consumeCombinator(); if (!$first && !$combinator) { break; } $parts = $this->parseSelectorParts(); if (!count($parts)) { throw new Exception("Missing selector after combinator"); } $first = false; $selCombinator = new Node\Combinator; $selCombinator->setOperator( $combinator ?: ($filter ? null : " ") ); foreach ($parts as $part) { $selCombinator->addPart($part); } $group->addCombinator($selCombinator); } $selector->addGroup($group); $this->consumeWhitespaces(); } while ($this->consume(",")); return $selector; } /** * Parses a set of selector pats * * @return array * * @throws Exception */ protected function parseSelectorParts() { $parts = array(); while (true) { if ( ($part = $this->parseSelectorPartType()) || ($part = $this->parseSelectorPartAttribute()) || ($part = $this->parseSelectorPartPseudo()) ) { $parts[] = $part; } else { break; } } return $parts; } /** * Parses a type selector part * * @return Node\Part\Type|null */ protected function parseSelectorPartType() { $type = $this->consumeWord(); if ($type) { $part = new Node\Part\Type; $part->setType($type); return $part; } return null; } /** * Parses an attribute selector part * * @return Node\Part\Attribute|null * * @throws Exception */ protected function parseSelectorPartAttribute() { if (!$this->consume("[")) { return null; } $this->consumeWhitespaces(); $part = new Node\Part\Attribute; if (!($name = $this->consumeWord())) { throw new Exception("Missing attribute name"); } $part->addName($name); while ($this->consume(".")) { if (!($name = $this->consumeWord())) { throw new Exception("Missing attribute name after dot"); } $part->addName($name); } $this->consumeWhitespaces(); $operator = $this->consumeAny($this->attrOperatorChars); if ($operator) { if (!in_array($operator, $this->attrOperators)) { throw new Exception("Invalid attribute operator '$operator'"); } $part->setOperator($operator); $this->consumeWhitespaces(); if (!($value = $this->parseLiteral())) { throw new Exception("Missing attribute value"); } $part->setValue($value[0]); if ($value[1]) { if ($operator != "=") { throw new Exception( "Only '=' operator is valid for attribute regex match" ); } $part->setRegex(true); } $this->consumeWhitespaces(); if ($this->consume("i")) { if (!is_string($value[0]) || $value[1]) { throw new Exception( "Case insensitive flag can be used only for string values" ); } $part->setCaseInsensitive(true); $this->consumeWhitespaces(); } } if (!$this->consume("]")) { throw new Exception("Unterminated attribute selector"); } return $part; } /** * Parses a pseudo selector part * * @return Node\Part\Pseudo|null * * @throws Exception */ protected function parseSelectorPartPseudo() { if (!$this->consume(":")) { return null; } $name = $this->consumeWord("-"); if (!isset($this->validPseudo[$name])) { throw new Exception("Unsupported pseudo selector '$name'"); } $argsType = $this->validPseudo[$name]; $error = false; if ($argsType === 1) { $part = new Node\Part\PseudoIndex; if (!$this->consume("(")) { $error = true; } if (!$error) { $this->consumeWhitespaces(); if ($indices = $this->consumeRegex("-?\d*n(?:\+\d+)?|\d+")) { $indices = explode("n", $indices); if (count($indices) === 1) { $part->setOffset((int) $indices[0]); } else { switch ($indices[0]) { case "": $part->setStep(1); break; case "-": $part->setStep(-1); break; default: $part->setStep((int) $indices[0]); break; } if ($indices[1] !== "") { $part->setOffset((int) $indices[1]); } } } elseif ( ($word = $this->consumeWord()) && ($word === "even" || $word === "odd") ) { $part->setStep(2); if ($word === "odd") { $part->setOffset(1); } } else { $error = true; } $this->consumeWhitespaces(); if (!$error && !$this->consume(")")) { $error = true; } } } elseif ($argsType === 2) { $part = new Node\Part\PseudoSelector; if ( $this->consume("(") && ($selector = $this->parseSelector($name !== "has")) && $this->consume(")") ) { $part->setSelector($selector); } else { $error = true; } } else { $part = new Node\Part\PseudoSimple; } if ($error) { throw new Exception( "Invalid argument for pseudo selector '$name'" ); } $part->setName($name); return $part; } /** * Parses a literal value * * @return array|null * * @throws Exception */ protected function parseLiteral() { if ( ($literal = $this->parseLiteralBoolNull()) !== 0 || ($literal = $this->parseLiteralString()) !== null || ($literal = $this->parseLiteralNumber()) !== null ) { return array($literal, false); } elseif ($literal = $this->parseLiteralRegex()) { return array($literal, true); } return null; } /** * Parses a literal boolean or null value * * @return int|bool|null * * @throws Exception */ protected function parseLiteralBoolNull() { $word = $this->consumeWord(); if (!$word) { return 0; } elseif ($word === "true") { return true; } elseif ($word === "false") { return false; } elseif ($word === "null") { return null; } throw new Exception("Invalid attribute value '$word'"); } /** * Parses a literal string * * @return string|null * * @throws Exception */ protected function parseLiteralString() { if (!($quote = $this->consumeAny(array("'", '"'), true))) { return null; } if (($str = $this->consumeUntil($quote)) === null) { throw new Exception("Unterminated string in attribute value"); } return $str; } /** * Parses a literal number * * @return int|float|null */ protected function parseLiteralNumber() { if ( $this->getChar() === "0" && ($val = $this->consumeRegex("0[xX][a-fA-F]+|0[bB][01]+|0[oO][0-7]+")) ) { $form = strtolower($val[1]); $val = substr($val, 2); if ($form === "x") { return hexdec($val); } elseif ($form === "o") { return octdec($val); } return bindec($val); } $reg = "-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?|-?\.\d+(?:[eE][+-]?\d+)?"; if (!($val = $this->consumeRegex($reg))) { return null; } return (float) $val; } /** * Parses a literal regex * * @return string|null * * @throws Exception */ protected function parseLiteralRegex() { if (!($sep = $this->consume("/"))) { return null; } if (($reg = $this->consumeUntil($sep, false, true)) === null) { throw new Exception("Unterminated regex in attribute value"); } $modifiers = $this->consumeWord(); return $sep . $reg . ($modifiers ?: ""); } /** * Consumes the given regex * * @param string $regex Regex to consume * * @return mixed|null */ protected function consumeRegex($regex) { if ($this->getChar() === null) { return null; } if (!preg_match("#^($regex)#", substr($this->selector, $this->index), $matches)) { return null; } $this->index += strlen($matches[1]); return $matches[1]; } /** * Consumes all the characters until the given one is reached * * @param string $stop Stop character * @param bool $removeEscapes If false escape characters won't be removed * @param false $includeStop If true stop character will be returned * * @return string|null */ protected function consumeUntil($stop, $removeEscapes = true, $includeStop = false) { $buffer = ""; $escaped = false; while (($char = $this->getChar()) !== null) { $this->index += 1; if (!$escaped) { if ($char === "\\") { $escaped = true; if (!$removeEscapes) { $buffer .= $char; } continue; } elseif ($char === $stop) { if ($includeStop) { $buffer .= $char; } return $buffer; } } $buffer .= $char; $escaped = false; } return null; } /** * Consumes a word composed by characters a-z * * @param null|string $extraChar Extra character to match * * @return string */ protected function consumeWord($extraChar = null) { $buffer = ""; while ($char = $this->getChar()) { if ( ($char >= "a" && $char <= "z") || ($char >= "A" && $char <= "Z") || ($extraChar !== null && $char === $extraChar) ) { $buffer .= $char; $this->index += 1; } else { break; } } return $buffer; } /** * Consumes a combinator * * @return string|null */ protected function consumeCombinator() { //Initial ws can be trimmed if followed by another combinator $ws = $this->consumeWhitespaces(); if ($combinator = $this->consumeAny($this->combinators, true)) { $this->consumeWhitespaces(); } elseif ($ws) { //If there's no other combinators use the space $combinator = " "; } else { $combinator = null; } return $combinator; } /** * Consumes as much whitespaces as possible * * @return string */ protected function consumeWhitespaces() { return $this->consumeAny($this->whitespaces); } /** * Consumes the given characters * * @param array $chars Characters to consume * @param false $stopAtFirst If true only the first matching character * is consumed * * @return string */ protected function consumeAny($chars, $stopAtFirst = false) { $buffer = ""; while (($char = $this->getChar()) !== null) { if (in_array($char, $chars)) { $buffer .= $char; $this->index++; if ($stopAtFirst) { break; } } else { break; } } return $buffer; } /** * Consumes the current character if it is equal to the * given one * * @param string $char Character to compare * * @return string|null */ protected function consume($char) { if ($this->getChar() === $char) { $this->index++; return $char; } return null; } /** * Returns the current character or null if the end * have been reached * * @return string|null */ protected function getChar() { if ($this->index < $this->length) { return $this->selector[$this->index]; } return null; } }