e8c06b0312
Fixed a few issues in the lexer for heredocs and nowdocs. The source file is read in 64k chunks, and any time a doc was split across a buffer boundary the lexer would fail to consume the doc properly. Modified the lexer to refill the file buffer when it is used, or when more data is needed in a variety of cases. Also fixed a number of other corner cases where we'd fail to recognize the doc end label or other special characters. The old code was also a bit over- and under-flowy.
600 linhas
16 KiB
C++
600 linhas
16 KiB
C++
/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010- Facebook, Inc. (http://www.facebook.com)         |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/
|
|
|
|
#include "util/parser/scanner.h"
|
|
#include "util/util.h"
|
|
#include "util/logger.h"
|
|
#include "util/zend/zend_string.h"
|
|
#include "util/zend/zend_html.h"
|
|
|
|
namespace HPHP {
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
// Rewrites the token text in place: every ":" is replaced by "__" and every
// "-" by "_". When `prefix` is true, "xhp_" is then prepended to the result.
void ScannerToken::xhpLabel(bool prefix /* = true */) {
  Util::replaceAll(m_text, ":", "__");
  Util::replaceAll(m_text, "-", "_");
  if (!prefix) {
    return;
  }
  m_text = "xhp_" + m_text;
}
|
|
|
|
// Normalizes whitespace in the token text, in place.
//
// - Leading whitespace, if any, collapses to a single ' '.
// - Trailing whitespace, if any, collapses to a single ' '.
// - Every interior run of whitespace collapses to a single ' '.
//
// Returns false (and leaves the text empty) when the text is empty or
// consists only of whitespace; returns true otherwise.
bool ScannerToken::htmlTrim() {
  assert(!m_text.empty());
  if (m_text.empty()) {
    // With asserts compiled out, "size() - 1" below would step before the
    // start of the buffer; there is nothing to trim anyway.
    return false;
  }

  // isspace() has undefined behavior for negative arguments other than EOF,
  // so bytes >= 0x80 (e.g. UTF-8 sequences) must go through unsigned char.
  auto isSpace = [](char c) {
    return isspace(static_cast<unsigned char>(c)) != 0;
  };

  const char *const first = m_text.c_str();
  const char *const last = first + m_text.size() - 1;

  // Skip leading whitespace; test the bound before dereferencing.
  const char *p0 = first;
  while (p0 <= last && isSpace(*p0)) ++p0;
  if (p0 > last) {
    m_text.clear();
    return false;
  }

  // Skip trailing whitespace; *p0 is known to be non-space, so this stops.
  const char *p1 = last;
  while (p1 > p0 && isSpace(*p1)) --p1;

  string text;
  text.reserve(m_text.length());
  if (p0 != first) {
    text = " ";
  }
  for (const char *p = p0; p <= p1; ++p) {
    if (isSpace(*p)) {
      // *p1 is never whitespace, so the run ends at or before p1.
      while (isSpace(*p)) ++p;
      text += ' ';
    }
    text += *p;
  }
  if (p1 != last) {
    text += " ";
  }
  m_text = text;
  return true;
}
|
|
|
|
void ScannerToken::xhpDecode() {
|
|
int len = m_text.size();
|
|
// note: 5th arg is charset_hint string; here we pass nullptr to indicate
|
|
// "use the default one" which is UTF-8. (Just saves a charset lookup.)
|
|
char *ret = string_html_decode(m_text.c_str(), len, true,
|
|
false, nullptr, true, true);
|
|
// safety check: decode function returns null iff charset unrecognized;
|
|
// i.e. nullptr result would mean UTF-8 is available.
|
|
// Pretty sure it is universally available!
|
|
// (Do assertion anyway.)
|
|
assert(ret);
|
|
m_text = string(ret, len);
|
|
free(ret);
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
// Builds a scanner that reads from the file named `filename`.
// The scanner opens and owns the std::ifstream. FileOpenException is thrown
// if the stream reports failure right after opening. When `md5` is true the
// MD5 of the stream is computed before init() runs.
Scanner::Scanner(const char *filename, int type, bool md5 /* = false */)
    : m_filename(filename), m_stream(nullptr), m_source(nullptr), m_len(0),
      m_pos(0), m_state(Start), m_type(type), m_yyscanner(nullptr),
      m_token(nullptr), m_loc(nullptr), m_lastToken(-1), m_isStrictMode(0),
      m_lookaheadLtDepth(0) {
  m_stream = new std::ifstream(filename);
  m_streamOwner = true;
  if (m_stream->fail()) {
    // The destructor will not run for a throwing constructor, so release the
    // stream here.
    delete m_stream;
    m_stream = nullptr;
    throw FileOpenException(filename);
  }
  if (md5) {
    computeMd5();
  }
  init();
}
|
|
|
|
// Builds a scanner that reads from a caller-supplied stream.
// The scanner stores the stream's address but does not own it
// (m_streamOwner is false). When `md5` is true the MD5 of the stream is
// computed before init() runs.
Scanner::Scanner(std::istream &stream, int type,
                 const char *fileName /* = "" */,
                 bool md5 /* = false */)
    : m_filename(fileName), m_source(nullptr), m_len(0), m_pos(0),
      m_state(Start), m_type(type), m_yyscanner(nullptr), m_token(nullptr),
      m_loc(nullptr), m_lastToken(-1), m_isStrictMode(0),
      m_lookaheadLtDepth(0) {
  m_streamOwner = false;
  m_stream = &stream;
  if (md5) {
    computeMd5();
  }
  init();
}
|
|
|
|
// Builds a scanner that reads from an in-memory buffer of `len` bytes.
// `source` must be non-null. Only when `md5` is true is an owned
// std::istringstream (a copy of the buffer) created, and the MD5 is computed
// from it; otherwise m_stream stays null.
Scanner::Scanner(const char *source, int len, int type,
                 const char *fileName /* = "" */, bool md5 /* = false */)
    : m_filename(fileName), m_stream(nullptr), m_source(source), m_len(len),
      m_pos(0), m_state(Start), m_type(type), m_yyscanner(nullptr),
      m_token(nullptr), m_loc(nullptr), m_lastToken(-1), m_isStrictMode(0),
      m_lookaheadLtDepth(0) {
  assert(m_source);
  // The stream is owned exactly when it is created below.
  m_streamOwner = md5;
  if (md5) {
    m_stream = new std::istringstream(string(source, len));
    computeMd5();
  }

  init();
}
|
|
|
|
void Scanner::computeMd5() {
|
|
int startpos = m_stream->tellg();
|
|
m_stream->seekg(0, std::ios::end);
|
|
int length = m_stream->tellg();
|
|
m_stream->seekg(0, std::ios::beg);
|
|
char *ptr = (char*)malloc(length);
|
|
m_stream->read(ptr, length);
|
|
m_stream->seekg(startpos, std::ios::beg);
|
|
int out_len;
|
|
char *md5str = string_md5(ptr, length, false, out_len);
|
|
free(ptr);
|
|
m_md5 = string(md5str, out_len);
|
|
free(md5str);
|
|
}
|
|
|
|
// Calls reset() and then deletes the stream, but only when this scanner
// owns it.
Scanner::~Scanner() {
  reset();
  if (!m_streamOwner) {
    return;
  }
  delete m_stream;
}
|
|
|
|
void Scanner::setHashBang(const char *rawText, int rawLeng) {
|
|
if (m_type & ReturnAllTokens) {
|
|
setToken(rawText, rawLeng);
|
|
} else {
|
|
m_token->setText("", 0);
|
|
incLoc(rawText, rawLeng);
|
|
}
|
|
}
|
|
|
|
// scanToken() will always get a new token from the frontier
|
|
// regardless of whether there are tokens in the lookahead store
|
|
int Scanner::scanToken(ScannerToken &t, Location &l) {
|
|
m_token = &t;
|
|
m_loc = &l;
|
|
int tokid;
|
|
for (;;) {
|
|
tokid = scan();
|
|
switch (tokid) {
|
|
case T_DOC_COMMENT:
|
|
setDocComment(m_token->text());
|
|
/* fall through */
|
|
case T_COMMENT:
|
|
case T_OPEN_TAG:
|
|
case T_WHITESPACE:
|
|
if (m_type & ReturnAllTokens) {
|
|
// m_lastToken holds the last "signficant" token, so
|
|
// don't update it for comments or whitespace
|
|
return tokid;
|
|
}
|
|
break;
|
|
default:
|
|
m_lastToken = tokid;
|
|
return tokid;
|
|
}
|
|
}
|
|
}
|
|
|
|
// fetchToken() will return the first token in the lookahead store (if the
|
|
// lookahead store has tokens) or it will get a new token from the frontier
|
|
int Scanner::fetchToken(ScannerToken &t, Location &l) {
|
|
m_token = &t;
|
|
m_loc = &l;
|
|
int tokid;
|
|
if (!m_lookahead.empty()) {
|
|
// If there is a lookahead token, return that. No need to perform
|
|
// special logic for "ReturnAllTokens", we already accounted for
|
|
// that when the tokens were inserted into m_lookahead
|
|
TokenStore::iterator it = m_lookahead.begin();
|
|
tokid = it->t;
|
|
*m_token = it->token;
|
|
*m_loc = it->loc;
|
|
return tokid;
|
|
}
|
|
return scanToken(t,l);
|
|
}
|
|
|
|
// nextLookahead() advances an iterator forward in the lookahead store.
|
|
// If the end of the store is reached, a new token will be scanned from
|
|
// the frontier. nextLookahead skips over whitespace and comments.
|
|
void Scanner::nextLookahead(TokenStore::iterator& pos) {
|
|
for (;;) {
|
|
++pos;
|
|
if (pos == m_lookahead.end()) {
|
|
pos = m_lookahead.appendNew();
|
|
pos->loc = *m_loc;
|
|
pos->t = scanToken(pos->token, pos->loc);
|
|
}
|
|
switch (pos->t) {
|
|
case T_DOC_COMMENT:
|
|
case T_COMMENT:
|
|
case T_OPEN_TAG:
|
|
case T_WHITESPACE:
|
|
break;
|
|
default:
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Tries to consume a comma-separated list of types starting at `pos`, where
// each entry is a type optionally followed by T_AS and a second type.
// Returns true with `pos` on the first token after the list; returns false
// as soon as a type fails to parse.
bool Scanner::tryParseTypeList(TokenStore::iterator& pos) {
  while (tryParseNSType(pos)) {
    if (pos->t == T_AS) {
      nextLookahead(pos);
      if (!tryParseNSType(pos)) {
        return false;
      }
    }
    if (pos->t != ',') {
      return true;
    }
    nextLookahead(pos);
  }
  return false;
}
|
|
|
|
// Tries to consume a comma-separated list of types starting at `pos`.
// A T_VARARG token where a type is expected is consumed and ends the list
// successfully. Returns false as soon as a type fails to parse.
bool Scanner::tryParseFuncTypeList(TokenStore::iterator& pos) {
  while (pos->t != T_VARARG) {
    if (!tryParseNSType(pos)) {
      return false;
    }
    if (pos->t != ',') {
      return true;
    }
    nextLookahead(pos);
  }
  nextLookahead(pos);
  return true;
}
|
|
|
|
bool
|
|
Scanner::tryParseNSType(TokenStore::iterator& pos) {
|
|
if (pos->t == '@') {
|
|
nextLookahead(pos);
|
|
}
|
|
if (pos->t == '?') {
|
|
nextLookahead(pos);
|
|
}
|
|
if (pos->t == '(') {
|
|
nextLookahead(pos);
|
|
if (pos->t == T_FUNCTION) {
|
|
nextLookahead(pos);
|
|
if (pos->t != '(') return false;
|
|
nextLookahead(pos);
|
|
if (pos->t != ')') {
|
|
if (!tryParseFuncTypeList(pos)) return false;
|
|
if (pos->t != ')') return false;
|
|
}
|
|
nextLookahead(pos);
|
|
if (pos->t == ')') {
|
|
nextLookahead(pos);
|
|
return true;
|
|
}
|
|
if (pos->t != ':') return false;
|
|
nextLookahead(pos);
|
|
if (!tryParseNSType(pos)) return false;
|
|
if (pos->t != ')') return false;
|
|
nextLookahead(pos);
|
|
return true;
|
|
}
|
|
if (!tryParseTypeList(pos)) return false;
|
|
if (pos->t != ')') return false;
|
|
nextLookahead(pos);
|
|
return true;
|
|
}
|
|
if (pos->t == T_NAMESPACE) {
|
|
nextLookahead(pos);
|
|
if (pos->t != T_NS_SEPARATOR) return false;
|
|
nextLookahead(pos);
|
|
} else if (pos->t == T_NS_SEPARATOR) {
|
|
nextLookahead(pos);
|
|
}
|
|
for (;;) {
|
|
switch (pos->t) {
|
|
case T_STRING:
|
|
case T_XHP_ATTRIBUTE:
|
|
case T_XHP_CATEGORY:
|
|
case T_XHP_CHILDREN:
|
|
case T_XHP_REQUIRED:
|
|
case T_XHP_ENUM:
|
|
case T_ARRAY:
|
|
nextLookahead(pos);
|
|
break;
|
|
case T_XHP_LABEL:
|
|
nextLookahead(pos);
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
if (pos->t == T_UNRESOLVED_LT) {
|
|
TokenStore::iterator ltPos = pos;
|
|
nextLookahead(pos);
|
|
++m_lookaheadLtDepth;
|
|
bool isTypeList = tryParseTypeList(pos);
|
|
--m_lookaheadLtDepth;
|
|
if (!isTypeList || pos->t != '>') {
|
|
ltPos->t = '<';
|
|
return false;
|
|
}
|
|
ltPos->t = T_TYPELIST_LT;
|
|
pos->t = T_TYPELIST_GT;
|
|
nextLookahead(pos);
|
|
return true;
|
|
}
|
|
if (pos->t != T_NS_SEPARATOR) {
|
|
return true;
|
|
}
|
|
nextLookahead(pos);
|
|
}
|
|
}
|
|
|
|
// Returns the next token, resolving any T_UNRESOLVED_LT on the way: the
// tokens after the '<' are examined through the lookahead store to decide
// whether it opens a type list (T_TYPELIST_LT ... T_TYPELIST_GT) or is a
// plain '<'.
int Scanner::getNextToken(ScannerToken &t, Location &l) {
  const bool fromLookahead = !m_lookahead.empty();
  int tokid = fetchToken(t, l);

  if (LIKELY(tokid != T_UNRESOLVED_LT)) {
    // In the common case, we don't have to perform any resolution
    // and we can just return the token
    if (UNLIKELY(fromLookahead)) {
      // If we pulled a lookahead token, we need to remove it from
      // the lookahead store
      m_lookahead.popFront();
    }
    return tokid;
  }

  // We encountered a '<' character that needs to be resolved.
  if (!fromLookahead) {
    // If this token didn't come from the lookahead store, we
    // need to stash it there
    TokenStore::iterator stash = m_lookahead.appendNew();
    LookaheadToken ltd = { t, l, tokid };
    *stash = ltd;
  }

  // Look at subsequent tokens to determine if the '<' character
  // is the start of a type list
  TokenStore::iterator ltPos = m_lookahead.begin();
  TokenStore::iterator pos = ltPos;
  nextLookahead(pos);
  ++m_lookaheadLtDepth;
  const bool isTypeList = tryParseTypeList(pos);
  --m_lookaheadLtDepth;

  if (isTypeList && pos->t == '>') {
    ltPos->t = T_TYPELIST_LT;
    pos->t = T_TYPELIST_GT;
  } else {
    ltPos->t = '<';
  }

  // Re-fetch the (now resolved) front token. We pulled a lookahead token,
  // so we need to remove it from the lookahead store.
  tokid = fetchToken(t, l);
  m_lookahead.popFront();
  return tokid;
}
|
|
|
|
int Scanner::read(char *text, int &result, int max) {
|
|
if (m_stream) {
|
|
if (!m_stream->eof()) {
|
|
m_stream->read(text, max);
|
|
if (!m_stream->bad()) {
|
|
return (result = m_stream->gcount());
|
|
}
|
|
}
|
|
} else if (m_source) {
|
|
if (m_pos < m_len) {
|
|
int count = m_len - m_pos;
|
|
if (count > max) count = max;
|
|
if (count > 0) {
|
|
memcpy(text, m_source + m_pos, count);
|
|
m_pos += count;
|
|
return (result = count);
|
|
}
|
|
}
|
|
}
|
|
return (result = 0);
|
|
}
|
|
|
|
// Formats a printf-style message and stores it in m_error.
void Scanner::error(const char* fmt, ...) {
  va_list args;
  va_start(args, fmt);
  Util::string_vsnprintf(m_error, fmt, args);
  va_end(args);
}
|
|
|
|
void Scanner::warn(const char* fmt, ...) {
|
|
va_list ap;
|
|
va_start(ap, fmt);
|
|
string msg;
|
|
Util::string_vsnprintf(msg, fmt, ap);
|
|
va_end(ap);
|
|
|
|
Logger::Warning("%s: %s (Line: %d, Char %d)", msg.c_str(),
|
|
m_filename.c_str(), m_loc->line0, m_loc->char0);
|
|
}
|
|
|
|
void Scanner::incLoc(const char *rawText, int rawLeng) {
|
|
assert(rawText);
|
|
assert(rawLeng > 0);
|
|
|
|
switch (m_state) {
|
|
case Start:
|
|
break; // scanner set to (1, 1, 1, 1) already
|
|
case NoLineFeed:
|
|
m_loc->line0 = m_loc->line1;
|
|
m_loc->char0 = m_loc->char1 + 1;
|
|
break;
|
|
case HadLineFeed:
|
|
m_loc->line0 = m_loc->line1 + 1;
|
|
m_loc->char0 = 1;
|
|
break;
|
|
}
|
|
const char *p = rawText;
|
|
for (int i = 0; i < rawLeng; i++) {
|
|
switch (m_state) {
|
|
case Start:
|
|
break; // scanner set to (1, 1, 1, 1) already
|
|
case NoLineFeed:
|
|
m_loc->char1++;
|
|
break;
|
|
case HadLineFeed:
|
|
m_loc->line1++;
|
|
m_loc->char1 = 1;
|
|
break;
|
|
}
|
|
m_state = (*p++ == '\n' ? HadLineFeed : NoLineFeed);
|
|
}
|
|
}
|
|
|
|
// Decodes the backslash escapes in the `len` bytes at `str` and returns the
// resulting string. `str` does not have to be NUL-terminated: no byte at or
// past str[len] is ever read.
//
// quote_type == '\'' (single-quote rules):
//   only \\ and \' are decoded; any other backslash pair is kept verbatim.
// any other quote_type (double-quote rules):
//   \n \t \r \v \f \\ \$ are decoded;
//   \" becomes " when quote_type is '"', otherwise it is kept as \";
//   \x or \X followed by 1-2 hex digits and \ followed by 1-3 octal digits
//   become the byte with that value (truncated to char);
//   any other backslash pair is kept verbatim.
// A backslash that is the very last byte is copied through unchanged.
string Scanner::escape(const char *str, int len, char quote_type) const {
  string output;
  if (len > 0) {
    output.reserve(len);
  }

  if (quote_type == '\'') {
    for (int i = 0; i < len; i++) {
      const char ch = str[i];
      if (ch != '\\') {
        output += ch;
        continue;
      }
      if (++i >= len) {
        // Dangling backslash at the end of the input.
        assert(false);
        output += ch;
        break;
      }
      switch (str[i]) {
        case '\\': output += "\\"; break;
        case '\'': output += '\''; break;
        default:
          output += ch;
          output += str[i];
          break;
      }
    }
    return output;
  }

  // Bounds-checked lookahead helpers: the digit scans below peek one byte
  // ahead, which must not run past the end of the buffer. isxdigit() is
  // undefined for negative values, hence the unsigned char cast.
  const auto isHexAt = [str, len](int idx) {
    return idx < len && isxdigit(static_cast<unsigned char>(str[idx])) != 0;
  };
  const auto isOctalAt = [str, len](int idx) {
    return idx < len && str[idx] >= '0' && str[idx] <= '7';
  };

  for (int i = 0; i < len; i++) {
    const char ch = str[i];
    if (ch != '\\') {
      output += ch;
      continue;
    }
    if (++i >= len) {
      // Dangling backslash at the end of the input.
      output += ch;
      break;
    }
    switch (str[i]) {
      case 'n':  output += '\n'; break;
      case 't':  output += '\t'; break;
      case 'r':  output += '\r'; break;
      case 'v':  output += '\v'; break;
      case 'f':  output += '\f'; break;
      case '\\': output += '\\'; break;
      case '$':  output += '$';  break;
      case '"':
        // Only a literal delimited by '"' needs \" unescaped.
        if (quote_type != '"') {
          output += '\\';
        }
        output += '"';
        break;
      case 'x':
      case 'X':
        if (isHexAt(i + 1)) {
          string shex;
          shex += str[++i];      // 0th hex digit
          if (isHexAt(i + 1)) {
            shex += str[++i];    // 1st hex digit
          }
          output += static_cast<char>(strtol(shex.c_str(), nullptr, 16));
        } else {
          output += ch;
          output += str[i];
        }
        break;
      default:
        // check for an octal
        if (isOctalAt(i)) {
          string soct;
          soct += str[i];        // 0th octal digit
          if (isOctalAt(i + 1)) {
            soct += str[++i];    // 1st octal digit
            if (isOctalAt(i + 1)) {
              soct += str[++i];  // 2nd octal digit
            }
          }
          output += static_cast<char>(strtol(soct.c_str(), nullptr, 8));
        } else {
          output += ch;
          output += str[i];
        }
        break;
    }
  }
  return output;
}
|
|
|
|
// Returns an iterator to the first stored token: the head slab at its begin
// position. For an empty store this is the same value as end().
TokenStore::iterator TokenStore::begin() {
  iterator it = end();
  if (!empty()) {
    it.m_slab = m_head;
    it.m_pos = m_head->m_beginPos;
  }
  return it;
}
|
|
|
|
// Returns the past-the-end iterator, represented as a null slab at
// position 0.
TokenStore::iterator TokenStore::end() {
  iterator sentinel;
  sentinel.m_pos = 0;
  sentinel.m_slab = nullptr;
  return sentinel;
}
|
|
|
|
void TokenStore::popFront() {
|
|
if (empty()) return;
|
|
++m_head->m_beginPos;
|
|
if (m_head->m_beginPos < m_head->m_endPos) return;
|
|
LookaheadSlab* nextSlab = m_head->m_next;
|
|
if (!nextSlab) {
|
|
// We just removed the last token from the last slab. We hang on to the
|
|
// last slab instead of freeing it so that we don't keep allocating and
|
|
// freeing slabs in the common steady state.
|
|
m_head->m_beginPos = 0;
|
|
m_head->m_endPos = 0;
|
|
return;
|
|
}
|
|
delete m_head;
|
|
m_head = nextSlab;
|
|
}
|
|
|
|
// Reserves the next slot at the back of the store and returns an iterator
// to it. A new slab is allocated and linked in when there is no tail slab
// yet or the tail slab already holds LookaheadSlab::SlabSize entries.
TokenStore::iterator TokenStore::appendNew() {
  const bool tailHasRoom =
      m_tail && m_tail->m_endPos < LookaheadSlab::SlabSize;
  if (!tailHasRoom) {
    LookaheadSlab* fresh = new LookaheadSlab;
    fresh->m_next = nullptr;
    fresh->m_beginPos = 0;
    fresh->m_endPos = 0;
    if (m_tail) {
      m_tail->m_next = fresh;
    } else {
      m_head = fresh;
    }
    m_tail = fresh;
  }

  iterator it;
  it.m_slab = m_tail;
  it.m_pos = m_tail->m_endPos;
  ++m_tail->m_endPos;
  return it;
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
}
|