', '+', '-', '*', '/', '!', '^', '%', '|', '&', '#', ];

    /**
     * One-time setup: orders the reserved word list by length and builds the
     * regular expression fragments used by the tokenizer.
     */
    public function __construct()
    {
        // Order reserved words from longest to shortest. Mapping each word to
        // its length and using arsort() is 3x faster than usort.
        $lengthByWord = array_combine($this->reserved, array_map('strlen', $this->reserved));
        assert($lengthByWord !== false);
        arsort($lengthByWord);
        $this->reserved = array_keys($lengthByWord);

        // Builds a capturing alternation "(a|b|c)" from a list of literal words.
        $alternation = function (array $words): string {
            return '(' . implode('|', $this->quoteRegex($words)) . ')';
        };

        // Replaces every literal space in a pattern with "\s+".
        $flexibleSpaces = static function (string $pattern): string {
            return str_replace(' ', '\\s+', $pattern);
        };

        $this->regexBoundaries       = $alternation($this->boundaries);
        $this->regexReserved         = $alternation($this->reserved);
        $this->regexReservedToplevel = $flexibleSpaces($alternation($this->reservedToplevel));
        $this->regexReservedNewline  = $flexibleSpaces($alternation($this->reservedNewline));
        $this->regexFunction         = $alternation($this->functions);
    }

    /**
     * Takes a SQL string and breaks it into tokens.
     * Each token is a Token object holding a type and a value.
*
     * @param string $string The SQL string
     *
     * @return Cursor A cursor wrapping the list of tokens that were produced
     */
    public function tokenize(string $string): Cursor
    {
        $tokens    = [];
        $previous  = null;
        $remaining = strlen($string);

        // Length observed on the previous pass; seeded one above the real
        // length so the very first progress check passes.
        $lastLength = $remaining + 1;

        // Consume the input one token at a time until nothing is left
        while ($remaining) {
            // The input did not get shorter on the last pass: record what is
            // left as an error token and stop.
            if ($remaining >= $lastLength) {
                $tokens[] = new Token(Token::TOKEN_TYPE_ERROR, $string);

                return new Cursor($tokens);
            }

            $lastLength = $remaining;

            // Read the token at the start of the remaining input
            $previous = $this->createNextToken($string, $previous);
            $tokens[] = $previous;

            // Drop the consumed bytes from the front of the input
            $consumed   = strlen($previous->value());
            $string     = substr($string, $consumed);
            $remaining -= $consumed;
        }

        return new Cursor($tokens);
    }

    /**
     * Return the next token and token type in a SQL string.
     * Quoted strings, comments, reserved words, whitespace, and punctuation
     * are all their own tokens.
     *
     * @param string     $string   The SQL string
     * @param Token|null $previous The result of the previous createNextToken() call
     *
     * @return Token The token found at the start of $string, carrying its type and value.
*/
    private function createNextToken(string $string, ?Token $previous = null): Token
    {
        $matches = [];

        // Whitespace
        if (preg_match('/^\s+/', $string, $matches)) {
            return new Token(Token::TOKEN_TYPE_WHITESPACE, $matches[0]);
        }

        // Comment: "#...", "--..." or "/* ... */"
        if (
            $string[0] === '#' ||
            (isset($string[1]) && (($string[0] === '-' && $string[1] === '-') ||
            ($string[0] === '/' && $string[1] === '*')))
        ) {
            if ($string[0] === '-' || $string[0] === '#') {
                // Comment until end of line (the newline itself is not part of the token)
                $last = strpos($string, "\n");
                $type = Token::TOKEN_TYPE_COMMENT;
            } else {
                // Comment until closing comment tag. An unterminated block
                // comment must not be asserted away: this is user input, and
                // with assertions disabled "false + 2" would cut the token
                // down to just "/*". Fall through to "rest of the string".
                $pos  = strpos($string, '*/', 2);
                $last = $pos !== false ? $pos + 2 : false;
                $type = Token::TOKEN_TYPE_BLOCK_COMMENT;
            }

            // No terminator found: the comment runs to the end of the input
            if ($last === false) {
                $last = strlen($string);
            }

            return new Token($type, substr($string, 0, $last));
        }

        // Quoted String
        if ($string[0] === '"' || $string[0] === '\'' || $string[0] === '`' || $string[0] === '[') {
            return new Token(
                ($string[0] === '`' || $string[0] === '['
                    ? Token::TOKEN_TYPE_BACKTICK_QUOTE
                    : Token::TOKEN_TYPE_QUOTE),
                $this->getQuotedString($string)
            );
        }

        // User-defined Variable
        if (($string[0] === '@' || $string[0] === ':') && isset($string[1])) {
            $value = null;
            $type  = Token::TOKEN_TYPE_VARIABLE;

            if ($string[1] === '"' || $string[1] === '\'' || $string[1] === '`') {
                // If the variable name is quoted
                $value = $string[0] . $this->getQuotedString(substr($string, 1));
            } else {
                // Non-quoted variable name
                preg_match('/^(' . $string[0] . '[a-zA-Z0-9\._\$]+)/', $string, $matches);
                if ($matches) {
                    $value = $matches[1];
                }
            }

            // A lone "@" or ":" with no usable name falls through to the checks below
            if ($value !== null) {
                return new Token($type, $value);
            }
        }

        // Number (decimal, binary, or hex). It must be followed by the end of
        // the input, whitespace, a quote character or a boundary character.
        // The quote characters are a character class here, matching the
        // non-reserved-word pattern at the end of this method.
        if (
            preg_match(
                '/^([0-9]+(\.[0-9]+)?|0x[0-9a-fA-F]+|0b[01]+)($|\s|["\'`]|' . $this->regexBoundaries . ')/',
                $string,
                $matches
            )
        ) {
            return new Token(Token::TOKEN_TYPE_NUMBER, $matches[1]);
        }

        // Boundary Character (punctuation and symbols)
        if (preg_match('/^(' . $this->regexBoundaries . ')/', $string, $matches)) {
            return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[1]);
        }

        // Keyword and function matching is done on an upper-cased copy
        $upper = strtoupper($string);

        // A reserved word cannot be preceded by a '.'
        // this makes it so in "mytable.from", "from" is not considered a reserved word
        if ($previous === null || $previous->value() !== '.') {
            // Checked in this order: top level, newline, then other reserved words
            $reservedKinds = [
                [$this->regexReservedToplevel, Token::TOKEN_TYPE_RESERVED_TOPLEVEL],
                [$this->regexReservedNewline, Token::TOKEN_TYPE_RESERVED_NEWLINE],
                [$this->regexReserved, Token::TOKEN_TYPE_RESERVED],
            ];

            foreach ($reservedKinds as [$regex, $type]) {
                if (preg_match('/^(' . $regex . ')($|\s|' . $this->regexBoundaries . ')/', $upper, $matches)) {
                    // $matches[1] is the upper-cased keyword at the start of the input
                    return new Token($type, $matches[1]);
                }
            }
        }

        // A function must be succeeded by '('
        // this makes it so "count(" is considered a function, but "count" alone is not
        // NOTE(review): the "\s" and "[)]" alternatives look unreachable, since
        // whitespace is consumed above and ")" is presumably a boundary
        // character - confirm against the boundaries list before removing them.
        if (preg_match('/^(' . $this->regexFunction . '[(]|\s|[)])/', $upper, $matches)) {
            // Drop the trailing "(" from the token value
            return new Token(
                Token::TOKEN_TYPE_RESERVED,
                substr($upper, 0, strlen($matches[1]) - 1)
            );
        }

        // Non reserved word: everything up to the next whitespace, quote or boundary character
        preg_match('/^(.*?)($|\s|["\'`]|' . $this->regexBoundaries . ')/', $string, $matches);

        return new Token(Token::TOKEN_TYPE_WORD, $matches[1]);
    }

    /**
     * Helper function for building regular expressions for reserved words and boundary characters
     *
     * @param string[] $strings The strings to be quoted
     *
     * @return string[] The strings escaped with preg_quote() for use inside a "/"-delimited pattern
     */
    private function quoteRegex(array $strings): array
    {
        return array_map(static function (string $string): string {
            return preg_quote($string, '/');
        }, $strings);
    }

    /**
     * Extracts the quoted string found at the very start of the given input.
     *
     * @param string $string Input expected to start with a quote character
     *
     * @return string The quoted string including its quote characters, or an
     *                empty string when the input does not start with one. A quote
     *                that is never closed extends to the end of the input.
     */
    private function getQuotedString(string $string): string
    {
        // This checks for the following patterns:
        // 1. backtick quoted string using `` to escape
        // 2. square bracket quoted string (SQL Server) using ]] to escape
        // 3. double quoted string using "" or \" to escape
        // 4. single quoted string using '' or \' to escape
        $pattern = '/^(((`[^`]*($|`))+)| '
            . '((\[[^\]]*($|\]))(\][^\]]*($|\]))*)| '
            . '(("[^"\\\\]*(?:\\\\.[^"\\\\]*)*("|$))+)| '
            . '((\'[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*(\'|$))+))/sx';

        if (preg_match($pattern, $string, $matches)) {
            return $matches[1];
        }

        return '';
    }
}