/*
Copyright (c) 2008-2009 Yahoo! Inc. All rights reserved.
The copyrights embodied in the content of this file are licensed by
Yahoo! Inc. under the BSD (revised) open source license

@author Dan Vlad Dascalescu

Parse function for PHP. Makes use of the tokenizer from tokenizephp.js.
Based on parsejavascript.js by Marijn Haverbeke.

Features:
 + special "deprecated" style for PHP4 keywords like 'var'
 + support for PHP 5.3 keywords: 'namespace', 'use'
 + 911 predefined constants, 1301 predefined functions, 105 predeclared classes
   from a typical PHP installation in a LAMP environment
 + new feature: syntax error flagging, thus enabling strict parsing of:
   + function definitions with explicitly or implicitly typed arguments and default values
   + modifiers (public, static etc.) applied to method and member definitions
   + foreach(array_expression as $key [=> $value]) loops
 + differentiation between single-quoted strings and double-quoted interpolating strings
*/

// add the Array.indexOf method for JS engines that don't support it (e.g. IE)
// code from https://developer.mozilla.org/En/Core_JavaScript_1.5_Reference/Global_Objects/Array/IndexOf
if (!Array.prototype.indexOf) {
  Array.prototype.indexOf = function(elt /*, from*/) {
    var len = this.length;

    var from = Number(arguments[1]) || 0;
    from = (from < 0) ? Math.ceil(from) : Math.floor(from);
    if (from < 0)
      from += len;

    for (; from < len; from++) {
      if (from in this && this[from] === elt)
        return from;
    }
    return -1;
  };
}

/**
 * The PHP parser module. It is exposed both as the global `PHPParser` and as
 * `Editor.Parser`, and provides:
 *   - make(input, basecolumn): creates a parser iterator over `input`
 *   - electricChars: the characters "{}:"
 *
 * Relies on `Editor`, `tokenizePHP` and `indentUnit` being defined elsewhere.
 */
var PHPParser = Editor.Parser = (function() {
  // Token types that can be considered to be atoms, part of operator expressions
  var atomicTypes = {
    "atom": true, "number": true, "variable": true, "string": true
  };

  // Constructor for the lexical context objects.
  function PHPLexical(indented, column, type, align, prev, info) {
    // indentation at start of this line
    this.indented = indented;
    // column at which this scope was opened
    this.column = column;
    // type of scope ('stat' (statement), 'form' (special form), '[', '{', or '(')
    this.type = type;
    // '[', '{', or '(' blocks that have any text after their opening
    // character are said to be 'aligned' -- any lines below are
    // indented all the way to the opening character.
    // The property is deliberately left absent when `align` is null or
    // undefined: next() decides it later via an `"align" in lexical` check.
    if (align != null)
      this.align = align;
    // Parent scope, if any.
    this.prev = prev;
    this.info = info;
  }

  // PHP indentation rules.
  // Returns a function that, given the first characters of a line, computes
  // the indentation for that line within the given lexical context.
  function indentPHP(lexical) {
    return function(firstChars) {
      var firstChar = firstChars && firstChars.charAt(0), type = lexical.type;
      // The scope type is named after its closing character, so a line that
      // starts with that character closes the scope.
      var closing = firstChar == type;
      if (type == "form" && firstChar == "{")
        return lexical.indented;
      else if (type == "stat" || type == "form")
        return lexical.indented + indentUnit;
      else if (lexical.info == "switch" && !closing)
        return lexical.indented + (/^(?:case|default)\b/.test(firstChars) ? indentUnit : 2 * indentUnit);
      else if (lexical.align)
        return lexical.column - (closing ? 1 : 0);
      else
        return lexical.indented + (closing ? 0 : indentUnit);
    };
  }

  // The parser-iterator-producing function itself.
  // `input` is handed to tokenizePHP; `basecolumn` (optional, defaults to 0)
  // is the base indentation column of the outermost scope.
  function parsePHP(input, basecolumn) {
    // Wrap the input in a token stream
    var tokens = tokenizePHP(input);
    // The parser state. cc is a stack of actions that have to be
    // performed to finish the current statement. For example we might
    // know that we still need to find a closing parenthesis and a
    // semicolon. Actions at the end of the stack go first. It is
    // initialized with an infinitely looping action that consumes
    // whole statements.
    var cc = [statements];
    // The lexical scope, used mostly for indentation.
    var lexical = new PHPLexical((basecolumn || 0) - indentUnit, 0, "block", false);
    // Current column, and the indentation at the start of the current
    // line. Used to create lexical scope objects.
    var column = 0;
    var indented = 0;
    // Variables which are used by the mark, cont, and pass functions
    // below to communicate with the driver loop in the 'next' function.
    var consume, marked;

    // The iterator object.
    var parser = {next: next, copy: copy};

    // parsing is accomplished by calling next() repeatedly
    function next(){
      // Start by performing any 'lexical' actions (adjusting the
      // lexical variable), or the operations below will be working
      // with the wrong lexical state.
      while(cc[cc.length - 1].lex)
        cc.pop()();

      // Fetch the next token.
      var token = tokens.next();

      // Adjust column and indented.
      if (token.type == "whitespace" && column == 0)
        indented = token.value.length;
      column += token.value.length;
      if (token.content == "\n"){
        indented = column = 0;
        // If the lexical scope's align property is still undefined at
        // the end of the line, it is an un-aligned scope.
        if (!("align" in lexical))
          lexical.align = false;
        // Newline tokens get an indentation function associated with
        // them.
        token.indentation = indentPHP(lexical);
      }
      // No more processing for meaningless tokens.
      if (token.type == "whitespace" || token.type == "comment"
          || token.type == "string_not_terminated")
        return token;
      // When a meaningful token is found and the lexical scope's
      // align is undefined, it is an aligned scope.
      if (!("align" in lexical))
        lexical.align = true;

      // Execute actions until one 'consumes' the token and we can
      // return it. 'marked' is used to change the style of the current token.
      while(true) {
        consume = marked = false;
        // Take and execute the topmost action.
        var action = cc.pop();
        action(token);

        if (consume){
          if (marked)
            token.style = marked;
          return token;
        }
      }
      return 1; // Firebug workaround for http://code.google.com/p/fbug/issues/detail?id=1239#c1
    }

    // This makes a copy of the parser state. It stores all the
    // stateful variables in a closure, and returns a function that
    // will restore them when called with a new input stream. Note
    // that the cc array has to be copied, because it is constantly
    // being modified. Lexical objects are shared, not copied, between
    // runs of the parser.
    function copy(){
      var _lexical = lexical, _cc = cc.concat([]), _tokenState = tokens.state;

      return function copyParser(input){
        lexical = _lexical;
        cc = _cc.concat([]); // copies the array
        column = indented = 0;
        tokens = tokenizePHP(input, _tokenState);
        return parser;
      };
    }

    // Helper function for pushing a number of actions onto the cc
    // stack in reverse order.
    function push(fs){
      for (var i = fs.length - 1; i >= 0; i--)
        cc.push(fs[i]);
    }
    // cont and pass are used by the action functions to add other
    // actions to the stack. cont will cause the current token to be
    // consumed, pass will leave it for the next action.
    function cont(){
      push(arguments);
      consume = true;
    }
    function pass(){
      push(arguments);
      consume = false;
    }
    // Used to change the style of the current token.
    function mark(style){
      marked = style;
    }
    // Add a layer of style to the current token, for example syntax-error.
    // When no base style has been set yet, the added style is used on its
    // own rather than being appended to a stringified false/undefined.
    function mark_add(style){
      marked = marked ? marked + ' ' + style : style;
    }

    // Push a new lexical context of the given type.
    function pushlex(type, info) {
      var result = function pushlexing() {
        lexical = new PHPLexical(indented, column, type, null, lexical, info);
      };
      result.lex = true;
      return result;
    }
    // Pop off the current lexical context.
    function poplex(){
      lexical = lexical.prev;
    }
    poplex.lex = true;
    // The 'lex' flag on these actions is used by the 'next' function
    // to know they can (and have to) be ran before moving on to the
    // next token.

    // Creates an action that discards tokens until it finds one of
    // the given type. This will ignore (and recover from) syntax errors.
    function expect(wanted){
      return function expecting(token){
        if (token.type == wanted)
          cont(); // consume the token
        else
          // Keep expecting: re-queue this same action. The function's own
          // name is used instead of arguments.callee, which is not
          // available in strict mode.
          cont(expecting);
      };
    }

    // Require a specific token type, or one of the tokens passed in the 'wanted' array
    // Used to detect blatant syntax errors. 'execute' is used to pass extra code
    // to be executed if the token is matched. For example, a '(' match could
    // 'execute' a cont( compasep(funcarg), require(")") )
    function require(wanted, execute){
      return function requiring(token){
        // Index of the matched entry in 'wanted' (0 for a plain string
        // match), or -1 when the token does not match.
        var ok;
        var type = token.type;
        if (typeof(wanted) == "string")
          ok = (type == wanted) ? 0 : -1;
        else
          ok = wanted.indexOf(type);
        if (ok >= 0) {
          if (execute && typeof(execute[ok]) == "function")
            execute[ok](token);
          cont(); // just consume the token
        }
        else {
          // Flag the offending token, then keep requiring. The function's
          // own name replaces arguments.callee (unavailable in strict mode).
          if (!marked) mark(token.style);
          mark_add("syntax-error");
          cont(requiring);
        }
      };
    }

    // Looks for a statement, and then calls itself.
    function statements(token){
      return pass(statement, statements);
    }
    // Dispatches various types of statements based on the type of the current token.
    function statement(token){
      var type = token.type;
      if (type == "keyword a")
        cont(pushlex("form"), expression, statement, poplex);
      else if (type == "keyword b")
        cont(pushlex("form"), statement, poplex);
      else if (type == "{")
        cont(pushlex("}"), block, poplex);
      else if (type == "function")
        funcdef();
      // technically, "class implode {...}" is correct, but we'll flag that as an error because it overrides a predefined function
      else if (type == "class")
        cont(require("t_string"), expect("{"), pushlex("}"), block, poplex);
      else if (type == "foreach")
        cont(pushlex("form"), require("("), pushlex(")"), expression, require("as"), require("variable"), /* => $value */ expect(")"), poplex, statement, poplex);
      else if (type == "for")
        cont(pushlex("form"), require("("), pushlex(")"), expression, require(";"), expression, require(";"), expression, require(")"), poplex, statement, poplex);
      // public final function foo(), protected static $bar;
      else if (type == "modifier")
        cont(require(["modifier", "variable", "function"], [null, null, funcdef]));
      else if (type == "switch")
        cont(pushlex("form"), require("("), expression, require(")"), pushlex("}", "switch"), require([":", "{"]), block, poplex, poplex);
      else if (type == "case")
        cont(expression, require(":"));
      else if (type == "default")
        cont(require(":"));
      else if (type == "catch")
        cont(pushlex("form"), require("("), require("t_string"), require("variable"), require(")"), statement, poplex);
      else if (type == "const")
        cont(require("t_string")); // 'const static x=5' is a syntax error
      // technically, "namespace implode {...}" is correct, but we'll flag that as an error because it overrides a predefined function
      else if (type == "namespace")
        cont(namespacedef, require(";"));
      // $variables may be followed by operators, () for variable function calls, or [] subscripts
      else
        pass(pushlex("stat"), expression, require(";"), poplex);
    }
    // Dispatch expression types.
    function expression(token){
      var type = token.type;
      if (atomicTypes.hasOwnProperty(type))
        cont(maybeoperator);
      else if (type == "<<<")
        cont(require("string"), maybeoperator); // heredoc/nowdoc
      else if (type == "t_string")
        cont(maybe_double_colon, maybeoperator);
      else if (type == "keyword c")
        cont(expression);
      // function call or parenthesized expression: $a = ($b + 1) * 2;
      else if (type == "(")
        cont(pushlex(")"), commasep(expression), require(")"), poplex, maybeoperator);
      else if (type == "operator")
        cont(expression);
    }
    // Called for places where operators, function calls, or subscripts are
    // valid. Will skip on to the next action if none is found.
    function maybeoperator(token){
      var type = token.type;
      if (type == "operator") {
        if (token.content == "?")
          cont(expression, require(":"), expression); // ternary operator
        else
          cont(expression);
      }
      else if (type == "(")
        cont(pushlex(")"), expression, commasep(expression), require(")"), poplex, maybeoperator /* $varfunc() + 3 */);
      // poplex is queued directly after the closing bracket (as in the "("
      // case above) so that the "]" scope is already closed when the next
      // token, possibly a newline needing indentation, is processed.
      else if (type == "[")
        cont(pushlex("]"), expression, require("]"), poplex, maybeoperator /* for multidimensional arrays, or $func[$i]() */);
    }
    // A regular use of the double colon to specify a class, as in self::func() or myclass::$var;
    // Differs from `namespace` or `use` in that only one class can be the parent; chains (A::B::$var) are a syntax error.
    function maybe_double_colon(token) {
      if (token.type == "t_double_colon")
        // A::$var, A::func(), A::const
        cont(require(["t_string", "variable"]), maybeoperator);
      else {
        // a t_string wasn't followed by ::, such as in a function call: foo()
        pass(expression);
      }
    }
    // the declaration or definition of a function
    function funcdef() {
      cont(require("t_string"), require("("), pushlex(")"), commasep(funcarg), require(")"), poplex, block);
    }
    // Parses a comma-separated list of the things that are recognized
    // by the 'what' argument.
    function commasep(what){
      function proceed(token) {
        if (token.type == ",")
          cont(what, proceed);
      }
      return function commaSeparated() {
        pass(what, proceed);
      };
    }
    // Look for statements until a closing brace is found.
    function block(token) {
      if (token.type == "}")
        cont();
      else
        pass(statement, block);
    }
    // An optional "= expression" following a function parameter.
    function maybedefaultparameter(token){
      if (token.content == "=")
        cont(expression);
    }
    // support for default arguments: http://us.php.net/manual/en/functions.arguments.php#functions.arguments.default
    function funcarg(token){
      // function foo(myclass $obj) {...}
      if (token.type == "t_string")
        cont(require("variable"), maybedefaultparameter);
      // function foo($string) {...}
      else if (token.type == "variable")
        cont(maybedefaultparameter);
    }

    // A namespace definition or use
    function maybe_double_colon_def(token) {
      if (token.type == "t_double_colon")
        cont(namespacedef);
    }
    function namespacedef(token) {
      pass(require("t_string"), maybe_double_colon_def);
    }

    return parser;
  }

  return {make: parsePHP, electricChars: "{}:"};

})();