Arquivos
hhvm/hphp/util/parser/scanner.cpp
T
mikemag e8c06b0312 Fix heredoc/nowdoc bugs with large docs, docs crossing buffer boundaries
Fixed a few issues in the lexer for heredocs and nowdocs. The source file is read in 64k chunks, and any time a doc was split across a buffer boundary the lexer would fail to consume the doc properly. Modified the lexer to refill the file buffer when it is used, or when more data is needed in a variety of cases. Also fixed a number of other corner cases where we'd fail to recognize the doc end label or other special characters. The old code was also a bit over- and under-flowy.
2013-04-02 15:00:45 -07:00

600 linhas
16 KiB
C++

/*
+----------------------------------------------------------------------+
| HipHop for PHP |
+----------------------------------------------------------------------+
| Copyright (c) 2010- Facebook, Inc. (http://www.facebook.com) |
+----------------------------------------------------------------------+
| This source file is subject to version 3.01 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| http://www.php.net/license/3_01.txt |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
*/
#include "util/parser/scanner.h"
#include "util/util.h"
#include "util/logger.h"
#include "util/zend/zend_string.h"
#include "util/zend/zend_html.h"
namespace HPHP {
///////////////////////////////////////////////////////////////////////////////
void ScannerToken::xhpLabel(bool prefix /* = true */) {
Util::replaceAll(m_text, ":", "__");
Util::replaceAll(m_text, "-", "_");
if (prefix) {
m_text = "xhp_" + m_text;
}
}
bool ScannerToken::htmlTrim() {
assert(!m_text.empty());
const char *p0 = m_text.c_str();
const char *p1 = m_text.c_str() + m_text.size() - 1;
const char *p00 = p0;
const char *p10 = p1;
while (isspace(*p0) && p0 <= p10) ++p0;
if (p0 > p10) {
m_text.clear();
return false;
}
while (isspace(*p1) && p1 > p0) --p1;
string text;
text.reserve(m_text.length());
if (p0 != p00) {
text = " ";
}
for (const char *p = p0; p <= p1; ++p) {
if (!isspace(*p)) {
text += *p;
} else {
while (isspace(*p)) ++p;
text += ' ';
text += *p;
}
}
if (p1 != p10) {
text += " ";
}
m_text = text;
return true;
}
void ScannerToken::xhpDecode() {
int len = m_text.size();
// note: 5th arg is charset_hint string; here we pass nullptr to indicate
// "use the default one" which is UTF-8. (Just saves a charset lookup.)
char *ret = string_html_decode(m_text.c_str(), len, true,
false, nullptr, true, true);
// safety check: decode function returns null iff charset unrecognized;
// i.e. nullptr result would mean UTF-8 is available.
// Pretty sure it is universally available!
// (Do assertion anyway.)
assert(ret);
m_text = string(ret, len);
free(ret);
}
///////////////////////////////////////////////////////////////////////////////
Scanner::Scanner(const char *filename, int type, bool md5 /* = false */)
: m_filename(filename), m_stream(nullptr), m_source(nullptr), m_len(0), m_pos(0),
m_state(Start), m_type(type), m_yyscanner(nullptr), m_token(nullptr),
m_loc(nullptr), m_lastToken(-1), m_isStrictMode(0), m_lookaheadLtDepth(0) {
m_stream = new std::ifstream(filename);
m_streamOwner = true;
if (m_stream->fail()) {
delete m_stream; m_stream = nullptr;
throw FileOpenException(filename);
}
if (md5) computeMd5();
init();
}
Scanner::Scanner(std::istream &stream, int type,
const char *fileName /* = "" */,
bool md5 /* = false */)
: m_filename(fileName), m_source(nullptr), m_len(0), m_pos(0),
m_state(Start), m_type(type), m_yyscanner(nullptr), m_token(nullptr),
m_loc(nullptr), m_lastToken(-1), m_isStrictMode(0), m_lookaheadLtDepth(0) {
m_stream = &stream;
m_streamOwner = false;
if (md5) computeMd5();
init();
}
Scanner::Scanner(const char *source, int len, int type,
const char *fileName /* = "" */, bool md5 /* = false */)
: m_filename(fileName), m_stream(nullptr), m_source(source), m_len(len),
m_pos(0), m_state(Start), m_type(type), m_yyscanner(nullptr),
m_token(nullptr), m_loc(nullptr), m_lastToken(-1), m_isStrictMode(0),
m_lookaheadLtDepth(0) {
assert(m_source);
m_streamOwner = false;
if (md5) {
m_stream = new std::istringstream(string(source, len));
m_streamOwner = true;
computeMd5();
}
init();
}
void Scanner::computeMd5() {
int startpos = m_stream->tellg();
m_stream->seekg(0, std::ios::end);
int length = m_stream->tellg();
m_stream->seekg(0, std::ios::beg);
char *ptr = (char*)malloc(length);
m_stream->read(ptr, length);
m_stream->seekg(startpos, std::ios::beg);
int out_len;
char *md5str = string_md5(ptr, length, false, out_len);
free(ptr);
m_md5 = string(md5str, out_len);
free(md5str);
}
Scanner::~Scanner() {
reset();
if (m_streamOwner) {
delete m_stream;
}
}
void Scanner::setHashBang(const char *rawText, int rawLeng) {
if (m_type & ReturnAllTokens) {
setToken(rawText, rawLeng);
} else {
m_token->setText("", 0);
incLoc(rawText, rawLeng);
}
}
// scanToken() will always get a new token from the frontier
// regardless of whether there are tokens in the lookahead store
int Scanner::scanToken(ScannerToken &t, Location &l) {
m_token = &t;
m_loc = &l;
int tokid;
for (;;) {
tokid = scan();
switch (tokid) {
case T_DOC_COMMENT:
setDocComment(m_token->text());
/* fall through */
case T_COMMENT:
case T_OPEN_TAG:
case T_WHITESPACE:
if (m_type & ReturnAllTokens) {
// m_lastToken holds the last "signficant" token, so
// don't update it for comments or whitespace
return tokid;
}
break;
default:
m_lastToken = tokid;
return tokid;
}
}
}
// fetchToken() will return the first token in the lookahead store (if the
// lookahead store has tokens) or it will get a new token from the frontier
int Scanner::fetchToken(ScannerToken &t, Location &l) {
m_token = &t;
m_loc = &l;
int tokid;
if (!m_lookahead.empty()) {
// If there is a lookahead token, return that. No need to perform
// special logic for "ReturnAllTokens", we already accounted for
// that when the tokens were inserted into m_lookahead
TokenStore::iterator it = m_lookahead.begin();
tokid = it->t;
*m_token = it->token;
*m_loc = it->loc;
return tokid;
}
return scanToken(t,l);
}
// nextLookahead() advances an iterator forward in the lookahead store.
// If the end of the store is reached, a new token will be scanned from
// the frontier. nextLookahead skips over whitespace and comments.
void Scanner::nextLookahead(TokenStore::iterator& pos) {
for (;;) {
++pos;
if (pos == m_lookahead.end()) {
pos = m_lookahead.appendNew();
pos->loc = *m_loc;
pos->t = scanToken(pos->token, pos->loc);
}
switch (pos->t) {
case T_DOC_COMMENT:
case T_COMMENT:
case T_OPEN_TAG:
case T_WHITESPACE:
break;
default:
return;
}
}
}
bool Scanner::tryParseTypeList(TokenStore::iterator& pos) {
for (;;) {
if (!tryParseNSType(pos)) return false;
if (pos->t == T_AS) {
nextLookahead(pos);
if (!tryParseNSType(pos)) return false;
}
if (pos->t != ',') return true;
nextLookahead(pos);
}
}
bool Scanner::tryParseFuncTypeList(TokenStore::iterator& pos) {
for (;;) {
if (pos->t == T_VARARG) {
nextLookahead(pos);
return true;
}
if (!tryParseNSType(pos)) return false;
if (pos->t != ',') return true;
nextLookahead(pos);
}
}
bool
Scanner::tryParseNSType(TokenStore::iterator& pos) {
if (pos->t == '@') {
nextLookahead(pos);
}
if (pos->t == '?') {
nextLookahead(pos);
}
if (pos->t == '(') {
nextLookahead(pos);
if (pos->t == T_FUNCTION) {
nextLookahead(pos);
if (pos->t != '(') return false;
nextLookahead(pos);
if (pos->t != ')') {
if (!tryParseFuncTypeList(pos)) return false;
if (pos->t != ')') return false;
}
nextLookahead(pos);
if (pos->t == ')') {
nextLookahead(pos);
return true;
}
if (pos->t != ':') return false;
nextLookahead(pos);
if (!tryParseNSType(pos)) return false;
if (pos->t != ')') return false;
nextLookahead(pos);
return true;
}
if (!tryParseTypeList(pos)) return false;
if (pos->t != ')') return false;
nextLookahead(pos);
return true;
}
if (pos->t == T_NAMESPACE) {
nextLookahead(pos);
if (pos->t != T_NS_SEPARATOR) return false;
nextLookahead(pos);
} else if (pos->t == T_NS_SEPARATOR) {
nextLookahead(pos);
}
for (;;) {
switch (pos->t) {
case T_STRING:
case T_XHP_ATTRIBUTE:
case T_XHP_CATEGORY:
case T_XHP_CHILDREN:
case T_XHP_REQUIRED:
case T_XHP_ENUM:
case T_ARRAY:
nextLookahead(pos);
break;
case T_XHP_LABEL:
nextLookahead(pos);
return true;
default:
return false;
}
if (pos->t == T_UNRESOLVED_LT) {
TokenStore::iterator ltPos = pos;
nextLookahead(pos);
++m_lookaheadLtDepth;
bool isTypeList = tryParseTypeList(pos);
--m_lookaheadLtDepth;
if (!isTypeList || pos->t != '>') {
ltPos->t = '<';
return false;
}
ltPos->t = T_TYPELIST_LT;
pos->t = T_TYPELIST_GT;
nextLookahead(pos);
return true;
}
if (pos->t != T_NS_SEPARATOR) {
return true;
}
nextLookahead(pos);
}
}
int Scanner::getNextToken(ScannerToken &t, Location &l) {
int tokid;
bool la = !m_lookahead.empty();
tokid = fetchToken(t, l);
if (LIKELY(tokid != T_UNRESOLVED_LT)) {
// In the common case, we don't have to perform any resolution
// and we can just return the token
if (UNLIKELY(la)) {
// If we pulled a lookahead token, we need to remove it from
// the lookahead store
m_lookahead.popFront();
}
return tokid;
}
// We encountered a '<' character that needs to be resolved.
if (!la) {
// If this token didn't come from the lookahead store, we
// need to stash it there
TokenStore::iterator it = m_lookahead.appendNew();
LookaheadToken ltd = { t, l, tokid };
*it = ltd;
}
// Look at subsequent tokens to determine if the '<' character
// is the start of a type list
TokenStore::iterator pos = m_lookahead.begin();
TokenStore::iterator ltPos = pos;
nextLookahead(pos);
++m_lookaheadLtDepth;
bool isTypeList = tryParseTypeList(pos);
--m_lookaheadLtDepth;
if (!isTypeList || pos->t != '>') {
ltPos->t = '<';
} else {
ltPos->t = T_TYPELIST_LT;
pos->t = T_TYPELIST_GT;
}
tokid = fetchToken(t, l);
// We pulled a lookahead token, we need to remove it from the
// lookahead store
m_lookahead.popFront();
return tokid;
}
int Scanner::read(char *text, int &result, int max) {
if (m_stream) {
if (!m_stream->eof()) {
m_stream->read(text, max);
if (!m_stream->bad()) {
return (result = m_stream->gcount());
}
}
} else if (m_source) {
if (m_pos < m_len) {
int count = m_len - m_pos;
if (count > max) count = max;
if (count > 0) {
memcpy(text, m_source + m_pos, count);
m_pos += count;
return (result = count);
}
}
}
return (result = 0);
}
void Scanner::error(const char* fmt, ...) {
va_list ap;
va_start(ap, fmt);
Util::string_vsnprintf(m_error, fmt, ap);
va_end(ap);
}
void Scanner::warn(const char* fmt, ...) {
va_list ap;
va_start(ap, fmt);
string msg;
Util::string_vsnprintf(msg, fmt, ap);
va_end(ap);
Logger::Warning("%s: %s (Line: %d, Char %d)", msg.c_str(),
m_filename.c_str(), m_loc->line0, m_loc->char0);
}
void Scanner::incLoc(const char *rawText, int rawLeng) {
assert(rawText);
assert(rawLeng > 0);
switch (m_state) {
case Start:
break; // scanner set to (1, 1, 1, 1) already
case NoLineFeed:
m_loc->line0 = m_loc->line1;
m_loc->char0 = m_loc->char1 + 1;
break;
case HadLineFeed:
m_loc->line0 = m_loc->line1 + 1;
m_loc->char0 = 1;
break;
}
const char *p = rawText;
for (int i = 0; i < rawLeng; i++) {
switch (m_state) {
case Start:
break; // scanner set to (1, 1, 1, 1) already
case NoLineFeed:
m_loc->char1++;
break;
case HadLineFeed:
m_loc->line1++;
m_loc->char1 = 1;
break;
}
m_state = (*p++ == '\n' ? HadLineFeed : NoLineFeed);
}
}
string Scanner::escape(const char *str, int len, char quote_type) const {
string output;
output.reserve(len);
if (quote_type == '\'') {
for (int i = 0; i < len; i++) {
unsigned char ch = str[i];
if (ch == '\\') {
if (++i < len) {
switch (str[i]) {
case '\\': output += "\\"; break;
case '\'': output += '\''; break;
default: {
output += ch;
output += str[i];
break;
}
}
} else {
assert(false);
output += ch;
}
} else {
output += ch;
}
}
} else {
for (int i = 0; i < len; i++) {
unsigned char ch = str[i];
if (ch == '\\') {
if (++i < len) {
switch (str[i]) {
case 'n': output += '\n'; break;
case 't': output += '\t'; break;
case 'r': output += '\r'; break;
case 'v': output += '\v'; break;
case 'f': output += '\f'; break;
case '\\': output += '\\'; break;
case '$': output += '$'; break;
case '"':
if (str[i] != quote_type) {
output += '\\';
}
output += '"';
break;
case 'x':
case 'X': {
if (isxdigit(str[i+1])) {
string shex;
shex += str[++i]; // 0th hex digit
if (isxdigit(str[i+1])) {
shex += str[++i]; // 1st hex digit
}
output += strtol(shex.c_str(), nullptr, 16);
} else {
output += ch;
output += str[i];
}
break;
}
default: {
// check for an octal
if ('0' <= str[i] && str[i] <= '7') {
string soct;
soct += str[i]; // 0th octal digit
if ('0' <= str[i+1] && str[i+1] <= '7') {
soct += str[++i]; // 1st octal digit
if ('0' <= str[i+1] && str[i+1] <= '7') {
soct += str[++i]; // 2nd octal digit
}
}
output += strtol(soct.c_str(), nullptr, 8);
} else {
output += ch;
output += str[i];
}
break;
}
}
} else {
output += ch;
}
} else {
output += ch;
}
}
}
return output;
}
TokenStore::iterator TokenStore::begin() {
if (empty()) {
return end();
}
iterator it;
it.m_slab = m_head;
it.m_pos = m_head->m_beginPos;
return it;
}
TokenStore::iterator TokenStore::end() {
iterator it;
it.m_slab = nullptr;
it.m_pos = 0;
return it;
}
void TokenStore::popFront() {
if (empty()) return;
++m_head->m_beginPos;
if (m_head->m_beginPos < m_head->m_endPos) return;
LookaheadSlab* nextSlab = m_head->m_next;
if (!nextSlab) {
// We just removed the last token from the last slab. We hang on to the
// last slab instead of freeing it so that we don't keep allocating and
// freeing slabs in the common steady state.
m_head->m_beginPos = 0;
m_head->m_endPos = 0;
return;
}
delete m_head;
m_head = nextSlab;
}
TokenStore::iterator TokenStore::appendNew() {
iterator it;
if (m_tail && m_tail->m_endPos < LookaheadSlab::SlabSize) {
it.m_slab = m_tail;
it.m_pos = m_tail->m_endPos;
++m_tail->m_endPos;
return it;
}
LookaheadSlab* newSlab = new LookaheadSlab;
newSlab->m_next = nullptr;
newSlab->m_beginPos = 0;
newSlab->m_endPos = 0;
if (m_tail) {
m_tail->m_next = newSlab;
m_tail = m_tail->m_next;
} else {
m_head = m_tail = newSlab;
}
it.m_slab = m_tail;
it.m_pos = newSlab->m_endPos;
++newSlab->m_endPos;
return it;
}
///////////////////////////////////////////////////////////////////////////////
}