fef62f03a2
Now that HHVM is the default runtime, this namespace doesn't mean anything.
1777 linhas
47 KiB
C++
1777 linhas
47 KiB
C++
/*
|
|
+----------------------------------------------------------------------+
|
|
| HipHop for PHP |
|
|
+----------------------------------------------------------------------+
|
|
| Copyright (c) 2010- Facebook, Inc. (http://www.facebook.com) |
|
|
| Copyright (c) 1997-2010 The PHP Group |
|
|
+----------------------------------------------------------------------+
|
|
| This source file is subject to version 3.01 of the PHP license, |
|
|
| that is bundled with this package in the file LICENSE, and is |
|
|
| available through the world-wide-web at the following url: |
|
|
| http://www.php.net/license/3_01.txt |
|
|
| If you did not receive a copy of the PHP license and are unable to |
|
|
| obtain it through the world-wide-web, please send a note to |
|
|
| license@php.net so we can mail you a copy immediately. |
|
|
+----------------------------------------------------------------------+
|
|
*/
|
|
|
|
#include <runtime/ext/ext_fb.h>
|
|
#include <runtime/base/memory/memory_manager.h>
|
|
#include <runtime/base/util/request_local.h>
|
|
#include <runtime/base/zend/zend_math.h>
|
|
#include <runtime/base/server/server_stats.h>
|
|
#include <runtime/base/ini_setting.h>
|
|
#include <runtime/vm/event_hook.h>
|
|
#include <util/alloc.h>
|
|
#include <util/vdso.h>
|
|
|
|
#ifdef __FreeBSD__
|
|
# include <sys/resource.h>
|
|
# include <sys/cpuset.h>
|
|
# define cpu_set_t cpuset_t
|
|
# define SET_AFFINITY(pid, size, mask) \
|
|
cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, size, mask)
|
|
# define GET_AFFINITY(pid, size, mask) \
|
|
cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, size, mask)
|
|
#elif __APPLE__
|
|
# include <mach/mach_init.h>
|
|
# include <mach/thread_policy.h>
|
|
# include <mach/thread_act.h>
|
|
|
|
# define cpu_set_t thread_affinity_policy_data_t
|
|
# define CPU_SET(cpu_id, new_mask) \
|
|
(*(new_mask)).affinity_tag = (cpu_id + 1)
|
|
# define CPU_ZERO(new_mask) \
|
|
(*(new_mask)).affinity_tag = THREAD_AFFINITY_TAG_NULL
|
|
# define GET_AFFINITY(pid, size, mask) \
|
|
(*(mask)).affinity_tag = THREAD_AFFINITY_TAG_NULL
|
|
# define SET_AFFINITY(pid, size, mask) \
|
|
thread_policy_set(mach_thread_self(), THREAD_AFFINITY_POLICY, \
|
|
(int *)mask, THREAD_AFFINITY_POLICY_COUNT)
|
|
#else
|
|
# include <sched.h>
|
|
# define SET_AFFINITY(pid, size, mask) sched_setaffinity(0, size, mask)
|
|
# define GET_AFFINITY(pid, size, mask) sched_getaffinity(0, size, mask)
|
|
#endif
|
|
|
|
|
|
#include <iostream>
|
|
#include <fstream>
|
|
#include <zlib.h>
|
|
|
|
// Append the delimiter
|
|
#define HP_STACK_DELIM "==>"
|
|
#define HP_STACK_DELIM_LEN (sizeof(HP_STACK_DELIM) - 1)
|
|
|
|
namespace HPHP {
|
|
IMPLEMENT_DEFAULT_EXTENSION(hotprofiler);
|
|
IMPLEMENT_DEFAULT_EXTENSION(xhprof);
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// helpers
|
|
|
|
/*
|
|
* A hash function to calculate a 8-bit hash code for a function name.
|
|
* This is based on a small modification to 'zend_inline_hash_func' by summing
|
|
* up all bytes of the ulong returned by 'zend_inline_hash_func'.
|
|
*
|
|
* @param str, char *, string to be calculated hash code for.
|
|
*
|
|
* @author cjiang
|
|
*/
|
|
static inline uint8_t hprof_inline_hash(const char * str) {
|
|
unsigned long h = 5381;
|
|
uint i = 0;
|
|
uint8_t res = 0;
|
|
|
|
while (*str) {
|
|
h += (h << 5);
|
|
h ^= (unsigned long) *str++;
|
|
}
|
|
|
|
for (i = 0; i < sizeof(unsigned long); i++) {
|
|
res += ((uint8_t *)&h)[i];
|
|
}
|
|
return res;
|
|
}
|
|
|
|
/**
|
|
* Get time delta in microseconds.
|
|
*/
|
|
static long get_us_interval(struct timeval *start, struct timeval *end) {
|
|
return (((end->tv_sec - start->tv_sec) * 1000000)
|
|
+ (end->tv_usec - start->tv_usec));
|
|
}
|
|
|
|
/**
|
|
* Incr time with the given microseconds.
|
|
*/
|
|
static void incr_us_interval(struct timeval *start, uint64_t incr) {
|
|
incr += (start->tv_sec * 1000000 + start->tv_usec);
|
|
start->tv_sec = incr/1000000;
|
|
start->tv_usec = incr%1000000;
|
|
return;
|
|
}
|
|
|
|
/**
|
|
* Truncates the given timeval to the nearest slot begin, where
|
|
* the slot size is determined by intr
|
|
*
|
|
* @param tv Input timeval to be truncated in place
|
|
* @param intr Time interval in microsecs - slot width
|
|
* @return void
|
|
* @author veeve
|
|
*/
|
|
static void hp_trunc_time(struct timeval *tv, uint64_t intr) {
|
|
uint64_t time_in_micro;
|
|
|
|
// Convert to microsecs and trunc that first
|
|
time_in_micro = (tv->tv_sec * 1000000) + tv->tv_usec;
|
|
time_in_micro /= intr;
|
|
time_in_micro *= intr;
|
|
|
|
// Update tv
|
|
tv->tv_sec = (time_in_micro / 1000000);
|
|
tv->tv_usec = (time_in_micro % 1000000);
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// High precision timer related functions.
|
|
|
|
/**
|
|
* Get time stamp counter (TSC) value via 'rdtsc' instruction.
|
|
*
|
|
* @return 64 bit unsigned integer
|
|
* @author cjiang
|
|
*/
|
|
inline uint64_t tsc() {
|
|
#ifdef __x86_64__
|
|
uint32_t __a,__d;
|
|
uint64_t val;
|
|
asm volatile("rdtsc" : "=a" (__a), "=d" (__d));
|
|
(val) = ((uint64_t)__a) | (((uint64_t)__d)<<32);
|
|
return val;
|
|
#else
|
|
// TODO(2200461): rdtsc isn't portable. Ideally we'd use some higher-level
|
|
// portable API (clock_gettime maybe?), but that may break assumptions that
|
|
// clients of this API make about how the underlying clock works.
|
|
return 0;
|
|
#endif
|
|
}
|
|
|
|
/**
|
|
* This is a microbenchmark to get cpu frequency the process is running on. The
|
|
* returned value is used to convert TSC counter values to microseconds.
|
|
*
|
|
* @return int64.
|
|
* @author cjiang
|
|
*/
|
|
static int64_t get_cpu_frequency() {
|
|
struct timeval start;
|
|
struct timeval end;
|
|
|
|
if (gettimeofday(&start, 0)) {
|
|
perror("gettimeofday");
|
|
return 0.0;
|
|
}
|
|
uint64_t tsc_start = tsc();
|
|
// Sleep for 5 miliseconds. Comparaing with gettimeofday's few microseconds
|
|
// execution time, this should be enough.
|
|
usleep(5000);
|
|
if (gettimeofday(&end, 0)) {
|
|
perror("gettimeofday");
|
|
return 0.0;
|
|
}
|
|
uint64_t tsc_end = tsc();
|
|
return nearbyint((tsc_end - tsc_start) * 1.0
|
|
/ (get_us_interval(&start, &end)));
|
|
}
|
|
|
|
#define MAX_LINELENGTH 1024
|
|
|
|
static int64_t* get_cpu_frequency_from_file(const char *file, int ncpus)
|
|
{
|
|
std::ifstream cpuinfo(file);
|
|
if (cpuinfo.fail()) {
|
|
return NULL;
|
|
}
|
|
char line[MAX_LINELENGTH];
|
|
int64_t* freqs = new int64_t[ncpus];
|
|
for (int i = 0; i < ncpus; ++i) {
|
|
freqs[i] = 0;
|
|
}
|
|
int processor = -1;
|
|
|
|
while (cpuinfo.getline(line, sizeof(line))) {
|
|
if (sscanf(line, "processor : %d", &processor) == 1) {
|
|
continue;
|
|
}
|
|
float freq;
|
|
if (sscanf(line, "cpu MHz : %f", &freq) == 1) {
|
|
if (processor != -1 && processor < ncpus) {
|
|
freqs[processor] = nearbyint(freq);
|
|
processor = -1;
|
|
}
|
|
}
|
|
}
|
|
for (int i = 0; i < ncpus; ++i) {
|
|
if (freqs[i] == 0) {
|
|
delete[] freqs;
|
|
return NULL;
|
|
}
|
|
}
|
|
return freqs;
|
|
}
|
|
|
|
class esyscall {
|
|
public:
|
|
int num;
|
|
|
|
explicit esyscall(const char *syscall_name) {
|
|
num = -1;
|
|
char format[strlen(syscall_name) + sizeof(" %d")];
|
|
sprintf(format, "%s %%d", syscall_name);
|
|
|
|
std::ifstream syscalls("/proc/esyscall");
|
|
if (syscalls.fail()) {
|
|
return;
|
|
}
|
|
char line[MAX_LINELENGTH];
|
|
if (!syscalls.getline(line, sizeof(line))) {
|
|
return;
|
|
}
|
|
// perhaps we should check the format, but we're just going to assume
|
|
// Name Number
|
|
while (syscalls.getline(line, sizeof(line))) {
|
|
int number;
|
|
if (sscanf(line, format, &number) == 1) {
|
|
num = number;
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// Machine information that we collect just once.
|
|
|
|
class MachineInfo {
|
|
public:
|
|
/**
|
|
* Bind the current process to a specified CPU. This function is to ensure
|
|
* that the OS won't schedule the process to different processors, which
|
|
* would make values read by rdtsc unreliable.
|
|
*
|
|
* @param uint32 cpu_id, the id of the logical cpu to be bound to.
|
|
*
|
|
* @author cjiang
|
|
*/
|
|
static void BindToCPU(uint32_t cpu_id) {
|
|
cpu_set_t new_mask;
|
|
CPU_ZERO(&new_mask);
|
|
CPU_SET(cpu_id, &new_mask);
|
|
SET_AFFINITY(0, sizeof(cpu_set_t), &new_mask);
|
|
}
|
|
|
|
public:
|
|
// The number of logical CPUs this machine has.
|
|
int m_cpu_num;
|
|
// Store the cpu frequency. Get it from /proc/cpuinfo if we can.
|
|
int64_t* m_cpu_frequencies;
|
|
|
|
MachineInfo() {
|
|
m_cpu_num = sysconf(_SC_NPROCESSORS_CONF);
|
|
m_cpu_frequencies = get_cpu_frequency_from_file("/proc/cpuinfo", m_cpu_num);
|
|
|
|
if (m_cpu_frequencies)
|
|
return;
|
|
|
|
m_cpu_frequencies = new int64_t[m_cpu_num];
|
|
for (int i = 0; i < m_cpu_num; i++) {
|
|
cpu_set_t prev_mask;
|
|
GET_AFFINITY(0, sizeof(cpu_set_t), &prev_mask);
|
|
BindToCPU(i);
|
|
// Make sure the current process gets scheduled to the target cpu. This
|
|
// might not be necessary though.
|
|
usleep(0);
|
|
m_cpu_frequencies[i] = get_cpu_frequency();
|
|
SET_AFFINITY(0, sizeof(cpu_set_t), &prev_mask);
|
|
}
|
|
}
|
|
|
|
~MachineInfo() {
|
|
delete[] m_cpu_frequencies;
|
|
}
|
|
};
|
|
static MachineInfo s_machine;
|
|
|
|
static inline uint64_t
|
|
tv_to_cycles(const struct timeval& tv, int64_t MHz)
|
|
{
|
|
return (((uint64_t)tv.tv_sec * 1000000) + tv.tv_usec) * MHz;
|
|
}
|
|
|
|
static inline uint64_t
|
|
to_usec(int64_t cycles, int64_t MHz, bool cpu_time = false)
|
|
{
|
|
static int64_t vdso_usable =
|
|
Util::Vdso::ClockGetTimeNS(CLOCK_THREAD_CPUTIME_ID);
|
|
|
|
if (cpu_time && vdso_usable >= 0)
|
|
return cycles / 1000;
|
|
return (cycles + MHz/2) / MHz;
|
|
}
|
|
|
|
static esyscall vtsc_syscall("vtsc");
|
|
|
|
static inline uint64_t vtsc(int64_t MHz) {
|
|
int64_t rval = Util::Vdso::ClockGetTimeNS(CLOCK_THREAD_CPUTIME_ID);
|
|
if (rval >= 0) {
|
|
return rval;
|
|
}
|
|
if (vtsc_syscall.num > 0) {
|
|
return syscall(vtsc_syscall.num);
|
|
}
|
|
struct rusage usage;
|
|
getrusage(RUSAGE_SELF, &usage);
|
|
return
|
|
tv_to_cycles(usage.ru_utime, MHz) + tv_to_cycles(usage.ru_stime, MHz);
|
|
}
|
|
|
|
#ifdef USE_JEMALLOC
|
|
|
|
#define JEMALLOC_STAT_MIB_LEN 2
|
|
size_t mallctl_mib_len = JEMALLOC_STAT_MIB_LEN;
|
|
size_t *mallctl_alloc_mib;
|
|
size_t *mallctl_free_mib;
|
|
bool mallctl_init = false;
|
|
|
|
bool
|
|
mallctl_mib_init()
|
|
{
|
|
if (mallctl_init) {
|
|
return false;
|
|
}
|
|
mallctl_init = true;
|
|
mallctl_alloc_mib = new size_t[mallctl_mib_len];
|
|
if (mallctlnametomib("thread.allocated",
|
|
mallctl_alloc_mib, &mallctl_mib_len))
|
|
{
|
|
return false;
|
|
}
|
|
mallctl_free_mib = new size_t[mallctl_mib_len];
|
|
if (mallctlnametomib("thread.deallocated",
|
|
mallctl_free_mib, &mallctl_mib_len))
|
|
{
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
#endif
|
|
|
|
uint64_t
|
|
get_allocs()
|
|
{
|
|
#ifdef USE_JEMALLOC
|
|
if (mallctl) {
|
|
if (!mallctl_alloc_mib) {
|
|
if (!mallctl_mib_init()) {
|
|
return 0;
|
|
}
|
|
}
|
|
uint64_t stat;
|
|
size_t size = sizeof(stat);
|
|
if (mallctlbymib(mallctl_alloc_mib, mallctl_mib_len, &stat, &size, NULL, 0))
|
|
{
|
|
return 0;
|
|
}
|
|
return stat;
|
|
}
|
|
#endif
|
|
#ifdef USE_TCMALLOC
|
|
if (MallocExtensionInstance) {
|
|
size_t stat;
|
|
MallocExtensionInstance()->GetNumericProperty(
|
|
"generic.thread_bytes_allocated", &stat);
|
|
return stat;
|
|
}
|
|
#endif
|
|
return 0;
|
|
}
|
|
|
|
uint64_t
|
|
get_frees()
|
|
{
|
|
#ifdef USE_JEMALLOC
|
|
if (mallctl) {
|
|
if (!mallctl_free_mib) {
|
|
if (!mallctl_mib_init()) {
|
|
return 0;
|
|
}
|
|
}
|
|
uint64_t stat;
|
|
size_t size = sizeof(stat);
|
|
if (mallctlbymib(mallctl_free_mib, mallctl_mib_len, &stat, &size, NULL, 0))
|
|
{
|
|
return 0;
|
|
}
|
|
return stat;
|
|
}
|
|
#endif
|
|
#ifdef USE_TCMALLOC
|
|
if (MallocExtensionInstance) {
|
|
size_t stat;
|
|
MallocExtensionInstance()->GetNumericProperty(
|
|
"generic.thread_bytes_freed", &stat);
|
|
return stat;
|
|
}
|
|
#endif
|
|
return 0;
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// classes
|
|
|
|
/**
|
|
* All information we collect about a frame.
|
|
*/
|
|
class Frame {
|
|
public:
|
|
Frame *m_parent; // ptr to parent frame
|
|
const char *m_name; // function name
|
|
uint8_t m_hash_code; // hash_code for the function name
|
|
int m_recursion; // recursion level for function
|
|
|
|
uint64_t m_tsc_start; // start value for TSC counter
|
|
int64_t m_mu_start; // memory usage
|
|
int64_t m_pmu_start; // peak memory usage
|
|
int64_t m_vtsc_start; // user/sys time start
|
|
|
|
/**
|
|
* Returns formatted function name
|
|
*
|
|
* @param result_buf ptr to result buf
|
|
* @param result_len max size of result buf
|
|
* @return total size of the function name returned in result_buf
|
|
* @author veeve
|
|
*/
|
|
size_t getName(char *result_buf, size_t result_len) {
|
|
if (result_len <= 1) {
|
|
return 0; // Insufficient result_bug. Bail!
|
|
}
|
|
|
|
// Add '@recurse_level' if required
|
|
// NOTE: Dont use snprintf's return val as it is compiler dependent
|
|
if (m_recursion) {
|
|
snprintf(result_buf, result_len, "%s@%d", m_name, m_recursion);
|
|
} else {
|
|
snprintf(result_buf, result_len, "%s", m_name);
|
|
}
|
|
|
|
// Force null-termination at MAX
|
|
result_buf[result_len - 1] = 0;
|
|
return strlen(result_buf);
|
|
}
|
|
|
|
/**
|
|
* Build a caller qualified name for a callee.
|
|
*
|
|
* For example, if A() is caller for B(), then it returns "A==>B".
|
|
* Recursive invokations are denoted with @<n> where n is the recursion
|
|
* depth.
|
|
*
|
|
* For example, "foo==>foo@1", and "foo@2==>foo@3" are examples of direct
|
|
* recursion. And "bar==>foo@1" is an example of an indirect recursive
|
|
* call to foo (implying the foo() is on the call stack some levels
|
|
* above).
|
|
*/
|
|
size_t getStack(int level, char *result_buf, size_t result_len) {
|
|
// End recursion if we dont need deeper levels or
|
|
// we dont have any deeper levels
|
|
if (!m_parent || level <= 1) {
|
|
return getName(result_buf, result_len);
|
|
}
|
|
|
|
// Take care of all ancestors first
|
|
size_t len = m_parent->getStack(level - 1, result_buf, result_len);
|
|
if (result_len < (len + HP_STACK_DELIM_LEN)) {
|
|
return len; // Insufficient result_buf. Bail out!
|
|
}
|
|
|
|
// Add delimiter only if entry had ancestors
|
|
if (len) {
|
|
strncat(result_buf + len, HP_STACK_DELIM, result_len - len);
|
|
len += HP_STACK_DELIM_LEN;
|
|
}
|
|
|
|
// Append the current function name
|
|
return len + getName(result_buf + len, result_len - len);
|
|
}
|
|
};
|
|
|
|
enum Flag {
|
|
TrackBuiltins = 0x1,
|
|
TrackCPU = 0x2,
|
|
TrackMemory = 0x4,
|
|
TrackVtsc = 0x8,
|
|
XhpTrace = 0x10,
|
|
MeasureXhprofDisable = 0x20,
|
|
GetTrace = 0x40,
|
|
TrackMalloc = 0x80,
|
|
};
|
|
|
|
static const StaticString s_ct("ct");
|
|
static const StaticString s_wt("wt");
|
|
static const StaticString s_cpu("cpu");
|
|
static const StaticString s_mu("mu");
|
|
static const StaticString s_pmu("pmu");
|
|
static const StaticString s_alloc("alloc");
|
|
static const StaticString s_free("free");
|
|
static const StaticString s_compressed_trace("(compressed_trace)");
|
|
|
|
/**
|
|
* Maintain profiles of a running stack.
|
|
*/
|
|
class Profiler {
|
|
public:
|
|
Profiler() : m_successful(true), m_stack(NULL), m_frame_free_list(NULL) {
|
|
if (!s_rand_initialized) {
|
|
s_rand_initialized = true;
|
|
srand(math_generate_seed());
|
|
}
|
|
|
|
// bind to a random cpu so that we can use rdtsc instruction.
|
|
int cur_cpu_id = rand() % s_machine.m_cpu_num;
|
|
GET_AFFINITY(0, sizeof(cpu_set_t), &m_prev_mask);
|
|
MachineInfo::BindToCPU(cur_cpu_id);
|
|
m_MHz = s_machine.m_cpu_frequencies[cur_cpu_id];
|
|
|
|
memset(m_func_hash_counters, 0, sizeof(m_func_hash_counters));
|
|
}
|
|
|
|
virtual ~Profiler() {
|
|
SET_AFFINITY(0, sizeof(cpu_set_t), &m_prev_mask);
|
|
|
|
endAllFrames();
|
|
for (Frame *p = m_frame_free_list; p;) {
|
|
Frame *cur = p;
|
|
p = p->m_parent;
|
|
free(cur);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Subclass can do extra work by overriding these two virtual functions.
|
|
*/
|
|
virtual void beginFrameEx() {} // called right before a function call
|
|
virtual void endFrameEx() {} // called right after a function is finished
|
|
|
|
/**
|
|
* Final results.
|
|
*/
|
|
virtual void writeStats(Array &ret) {}
|
|
|
|
/**
|
|
* Start a new frame with the specified symbol.
|
|
*/
|
|
virtual void beginFrame(const char *symbol) __attribute__ ((noinline)) ;
|
|
|
|
/**
|
|
* End top of the stack.
|
|
*/
|
|
virtual void endFrame(bool endMain = false) __attribute__ ((noinline)) ;
|
|
|
|
virtual void endAllFrames() {
|
|
while (m_stack) {
|
|
endFrame(true);
|
|
}
|
|
}
|
|
|
|
template<class phpret, class Name, class Counts>
|
|
static void returnVals(phpret& ret, const Name& name, const Counts& counts,
|
|
int flags, int64_t MHz)
|
|
{
|
|
ArrayInit arr(5);
|
|
arr.set(s_ct, counts.count);
|
|
arr.set(s_wt, to_usec(counts.wall_time, MHz));
|
|
if (flags & TrackCPU) {
|
|
arr.set(s_cpu, to_usec(counts.cpu, MHz, true));
|
|
}
|
|
if (flags & TrackMemory) {
|
|
arr.set(s_mu, counts.memory);
|
|
arr.set(s_pmu, counts.peak_memory);
|
|
} else if (flags & TrackMalloc) {
|
|
arr.set(s_alloc, counts.memory);
|
|
arr.set(s_free, counts.peak_memory);
|
|
}
|
|
ret.set(String(name), arr.create());
|
|
}
|
|
|
|
template<class phpret, class StatsMap>
|
|
static bool extractStats(phpret& ret, StatsMap& stats, int flags, int64_t MHz)
|
|
{
|
|
for (typename StatsMap::const_iterator iter = stats.begin();
|
|
iter != stats.end(); ++iter) {
|
|
returnVals(ret, iter->first, iter->second, flags, MHz);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool m_successful;
|
|
|
|
int64_t m_MHz; // cpu freq for either the local cpu or the saved trace
|
|
Frame *m_stack; // top of the profile stack
|
|
|
|
static bool s_rand_initialized;
|
|
|
|
cpu_set_t m_prev_mask; // saved cpu affinity
|
|
Frame *m_frame_free_list; // freelist of Frame
|
|
uint8_t m_func_hash_counters[256]; // counter table by hash code;
|
|
|
|
/**
|
|
* Fast allocate a Frame structure. Picks one from the
|
|
* free list if available, else does an actual allocate.
|
|
*/
|
|
Frame *createFrame(const char *symbol) {
|
|
Frame *p = m_frame_free_list;
|
|
if (p) {
|
|
m_frame_free_list = p->m_parent;
|
|
} else {
|
|
p = (Frame*)malloc(sizeof(Frame));
|
|
}
|
|
p->m_parent = m_stack;
|
|
p->m_name = symbol;
|
|
p->m_hash_code = hprof_inline_hash(symbol);
|
|
m_stack = p;
|
|
return p;
|
|
}
|
|
|
|
/**
|
|
* Fast free a Frame structure. Simply returns back the Frame to a free list
|
|
* and doesn't actually perform the free.
|
|
*/
|
|
void releaseFrame() {
|
|
assert(m_stack);
|
|
|
|
Frame *p = m_stack;
|
|
m_stack = p->m_parent;
|
|
p->m_parent = m_frame_free_list; // we overload the m_parent field here
|
|
m_frame_free_list = p;
|
|
}
|
|
};
|
|
bool Profiler::s_rand_initialized = false;
|
|
|
|
void Profiler::beginFrame(const char *symbol) {
|
|
Frame *current = createFrame(symbol);
|
|
|
|
// NOTE(cjiang): use hash code to fend off most of call-stack traversal
|
|
int recursion_level = 0;
|
|
if (m_func_hash_counters[current->m_hash_code] > 0) {
|
|
// Find this symbols recurse level
|
|
for (Frame *p = current->m_parent; p; p = p->m_parent) {
|
|
if (strcmp(current->m_name, p->m_name) == 0) {
|
|
recursion_level = p->m_recursion + 1;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
current->m_recursion = recursion_level;
|
|
|
|
m_func_hash_counters[current->m_hash_code]++;
|
|
beginFrameEx();
|
|
}
|
|
|
|
/**
|
|
* End top of the stack.
|
|
*/
|
|
void Profiler::endFrame(bool endMain) {
|
|
if (m_stack) {
|
|
// special case for main() frame that's only ended by endAllFrames()
|
|
if (!endMain && m_stack->m_parent == NULL) {
|
|
return;
|
|
}
|
|
endFrameEx();
|
|
m_func_hash_counters[m_stack->m_hash_code]--;
|
|
releaseFrame();
|
|
}
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// SimpleProfiler
|
|
|
|
/**
|
|
* vtsc() based profiler, but simple enough to print basic information.
|
|
*
|
|
* When available, we now use the vtsc() call, which is relatively inexpensive
|
|
* and accurate. It's still a system call, but a cheap one. If the call isn't
|
|
* available, the comment below still applies. --renglish
|
|
*
|
|
* COMMENT(cjiang): getrusage is very expensive and inaccurate. It is based
|
|
* on sampling at the rate about once every 5 to 10 miliseconds. The sampling
|
|
* error can be very significantly, especially given that we are
|
|
* instrumenting at a very fine granularity. (every PHP function call will
|
|
* lead to one invokation of getrusage.) Most PHP functions such as the
|
|
* built-ins typically finish in microseconds. Thus the result we get from
|
|
* getrusage is very likely going to be skewed. Also worth noting that
|
|
* getrusage actually is a system call, which involves expensive swapping
|
|
* between user-mode and kernel mode. I would suggest we remove collecting
|
|
* CPU usage all together, as exclusive wall-time is very useful already.
|
|
* Or at least we should make it an opt-in choice.
|
|
*
|
|
* See: http://ww2.cs.fsu.edu/~hines/present/timing_linux.pdf
|
|
*
|
|
* Above is a nice paper talking about the overhead and the inaccuracy problem
|
|
* associated with getrusage.
|
|
*/
|
|
class SimpleProfiler : public Profiler {
|
|
private:
|
|
class CountMap {
|
|
public:
|
|
CountMap() : count(0), tsc(0), vtsc(0) {}
|
|
|
|
int64_t count;
|
|
int64_t tsc;
|
|
int64_t vtsc;
|
|
};
|
|
typedef hphp_hash_map<std::string, CountMap, string_hash> StatsMap;
|
|
StatsMap m_stats; // outcome
|
|
|
|
public:
|
|
SimpleProfiler() {
|
|
print("<div style='display:none'>");
|
|
}
|
|
|
|
~SimpleProfiler() {
|
|
print("</div>");
|
|
print_output();
|
|
}
|
|
|
|
virtual void beginFrameEx() {
|
|
m_stack->m_tsc_start = tsc();
|
|
m_stack->m_vtsc_start = vtsc(m_MHz);
|
|
}
|
|
|
|
virtual void endFrameEx() {
|
|
CountMap &counts = m_stats[m_stack->m_name];
|
|
counts.count++;
|
|
counts.tsc += tsc() - m_stack->m_tsc_start;
|
|
counts.vtsc += vtsc(m_MHz) - m_stack->m_vtsc_start;
|
|
}
|
|
|
|
private:
|
|
void print_output() {
|
|
print("<link rel='stylesheet' href='/css/hotprofiler.css' type='text/css'>"
|
|
"<script language='javascript' src='/js/hotprofiler.js'></script>"
|
|
"<p><center><h2>Hotprofiler Data</h2></center><br>"
|
|
"<div id='hotprofiler_stats'></div>"
|
|
"<script language='javascript'>hotprofiler_data = [");
|
|
for (StatsMap::const_iterator iter = m_stats.begin();
|
|
iter != m_stats.end(); ++iter) {
|
|
print("{\"fn\": \"");
|
|
print(iter->first.c_str());
|
|
print("\"");
|
|
|
|
const CountMap &counts = iter->second;
|
|
|
|
char buf[512];
|
|
snprintf(
|
|
buf, sizeof(buf),
|
|
",\"ct\": %" PRId64 ",\"wt\": %" PRId64 ",\"ut\": %" PRId64 ",\"st\": 0",
|
|
counts.count, (int64_t)to_usec(counts.tsc, m_MHz),
|
|
(int64_t)to_usec(counts.vtsc, m_MHz, true));
|
|
print(buf);
|
|
|
|
print("},\n");
|
|
}
|
|
print("]; write_data('ut', false);</script><br><br> <br>");
|
|
}
|
|
};
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// HierarchicalProfiler
|
|
|
|
class HierarchicalProfiler : public Profiler {
|
|
private:
|
|
class CountMap {
|
|
public:
|
|
CountMap() : count(0), wall_time(0), cpu(0), memory(0), peak_memory(0) {}
|
|
|
|
int64_t count;
|
|
int64_t wall_time;
|
|
int64_t cpu;
|
|
int64_t memory;
|
|
int64_t peak_memory;
|
|
};
|
|
typedef hphp_hash_map<std::string, CountMap, string_hash> StatsMap;
|
|
StatsMap m_stats; // outcome
|
|
|
|
public:
|
|
|
|
public:
|
|
explicit HierarchicalProfiler(int flags) : m_flags(flags) {
|
|
}
|
|
|
|
virtual void beginFrameEx() {
|
|
m_stack->m_tsc_start = tsc();
|
|
|
|
if (m_flags & TrackCPU) {
|
|
m_stack->m_vtsc_start = vtsc(m_MHz);
|
|
}
|
|
|
|
if (m_flags & TrackMemory) {
|
|
MemoryManager *mm = MemoryManager::TheMemoryManager();
|
|
const MemoryUsageStats &stats = mm->getStats(true);
|
|
m_stack->m_mu_start = stats.usage;
|
|
m_stack->m_pmu_start = stats.peakUsage;
|
|
} else if (m_flags & TrackMalloc) {
|
|
m_stack->m_mu_start = get_allocs();
|
|
m_stack->m_pmu_start = get_frees();
|
|
}
|
|
}
|
|
|
|
virtual void endFrameEx() {
|
|
char symbol[512];
|
|
m_stack->getStack(2, symbol, sizeof(symbol));
|
|
CountMap &counts = m_stats[symbol];
|
|
counts.count++;
|
|
counts.wall_time += tsc() - m_stack->m_tsc_start;
|
|
|
|
if (m_flags & TrackCPU) {
|
|
counts.cpu += vtsc(m_MHz) - m_stack->m_vtsc_start;
|
|
}
|
|
|
|
if (m_flags & TrackMemory) {
|
|
MemoryManager *mm = MemoryManager::TheMemoryManager();
|
|
const MemoryUsageStats &stats = mm->getStats(true);
|
|
int64_t mu_end = stats.usage;
|
|
int64_t pmu_end = stats.peakUsage;
|
|
counts.memory += mu_end - m_stack->m_mu_start;
|
|
counts.peak_memory += pmu_end - m_stack->m_pmu_start;
|
|
} else if (m_flags & TrackMalloc) {
|
|
counts.memory += get_allocs() - m_stack->m_mu_start;
|
|
counts.peak_memory += get_frees() - m_stack->m_pmu_start;
|
|
}
|
|
}
|
|
|
|
virtual void writeStats(Array &ret) {
|
|
extractStats(ret, m_stats, m_flags, m_MHz);
|
|
}
|
|
|
|
private:
|
|
uint32_t m_flags;
|
|
};
|
|
|
|
template <class TraceIt, class Stats>
|
|
class walkTraceClass {
|
|
public:
|
|
struct Frame {
|
|
TraceIt trace;
|
|
int level;
|
|
int len;
|
|
};
|
|
typedef vector<std::pair<char *, int> >Recursion;
|
|
vector<std::pair<char *, int> >recursion;
|
|
vector<Frame> stack;
|
|
|
|
walkTraceClass() : arc_buff_len(200), arc_buff((char*)malloc(200)) {};
|
|
|
|
~walkTraceClass() {
|
|
free((void*)arc_buff);
|
|
if (recursion.size() > 1) {
|
|
Recursion::iterator r_it = recursion.begin();
|
|
while (++r_it != recursion.end()) {
|
|
delete[] r_it->first;
|
|
}
|
|
}
|
|
}
|
|
|
|
int arc_buff_len;
|
|
char *arc_buff;
|
|
|
|
void checkArcBuff(int len) {
|
|
len = 2*len + HP_STACK_DELIM_LEN + 2;
|
|
if (len >= arc_buff_len) {
|
|
arc_buff_len *= 2;
|
|
arc_buff = (char *)realloc(arc_buff, arc_buff_len);
|
|
if (arc_buff == NULL) {
|
|
throw std::bad_alloc();
|
|
}
|
|
}
|
|
}
|
|
|
|
void incStats(const char *arc, TraceIt tr, const Frame& fr, Stats& stats)
|
|
{
|
|
typename Stats::mapped_type& st = stats[arc];
|
|
++st.count;
|
|
st.wall_time += tr->wall_time - fr.trace->wall_time;
|
|
st.cpu += tr->cpu - fr.trace->cpu;
|
|
st.memory += tr->memory - fr.trace->memory;
|
|
st.peak_memory += tr->peak_memory - fr.trace->peak_memory;
|
|
}
|
|
|
|
void popFrame(TraceIt tIt, std::vector<Frame>& stack, Stats& stats)
|
|
{
|
|
Frame callee = stack.back();
|
|
stack.pop_back();
|
|
const char* arc;
|
|
Frame& caller = stack.back();
|
|
char *cp = arc_buff;
|
|
memcpy(cp, caller.trace->symbol.ptr, caller.len);
|
|
cp += caller.len;
|
|
if (caller.level >= 1) {
|
|
std::pair<char *, int>& lvl = recursion[caller.level];
|
|
memcpy(cp, lvl.first, lvl.second);
|
|
cp += lvl.second;
|
|
}
|
|
memcpy(cp, HP_STACK_DELIM, HP_STACK_DELIM_LEN);
|
|
cp += HP_STACK_DELIM_LEN;
|
|
memcpy(cp, callee.trace->symbol.ptr, callee.len);
|
|
cp += callee.len;
|
|
if (callee.level >= 1) {
|
|
std::pair<char *, int>& lvl = recursion[callee.level];
|
|
memcpy(cp, lvl.first, lvl.second);
|
|
cp += lvl.second;
|
|
}
|
|
*cp = 0;
|
|
arc = arc_buff;
|
|
incStats(arc, tIt, callee, stats);
|
|
}
|
|
|
|
void walk(TraceIt begin, TraceIt end, Stats& stats,
|
|
std::map<const char *, unsigned> &functionLevel)
|
|
{
|
|
if (begin == end) {
|
|
return;
|
|
}
|
|
recursion.push_back(std::make_pair((char *)NULL, 0));
|
|
while (begin != end && !begin->symbol.ptr) {
|
|
++begin;
|
|
}
|
|
while (begin != end) {
|
|
if (begin->symbol.ptr) {
|
|
unsigned level = ++functionLevel[begin->symbol.ptr];
|
|
if (level >= recursion.size()) {
|
|
char *level_string = new char[8];
|
|
sprintf(level_string, "@%u", level);
|
|
recursion.push_back(std::make_pair(level_string,
|
|
strlen(level_string)));
|
|
}
|
|
Frame fr;
|
|
fr.trace = begin;
|
|
fr.level = level - 1;
|
|
fr.len = strlen(begin->symbol.ptr);
|
|
checkArcBuff(fr.len);
|
|
stack.push_back(fr);
|
|
} else if (stack.size() > 1) {
|
|
--functionLevel[stack.back().trace->symbol.ptr];
|
|
popFrame(begin, stack, stats);
|
|
}
|
|
++begin;
|
|
}
|
|
--begin;
|
|
while (stack.size() > 1) {
|
|
popFrame(begin, stack, stats);
|
|
}
|
|
if (!stack.empty()) {
|
|
incStats(stack.back().trace->symbol.ptr, begin, stack.back(), stats);
|
|
}
|
|
}
|
|
};
|
|
|
|
template<class TraceIt, class Stats, class Fmap>
|
|
static void
|
|
walkTrace(TraceIt begin, TraceIt end,
|
|
Stats& stats,
|
|
Fmap &map)
|
|
{
|
|
walkTraceClass<TraceIt, Stats>walker;
|
|
walker.walk(begin, end, stats, map);
|
|
}
|
|
|
|
struct TraceData {
|
|
int64_t wall_time;
|
|
int64_t cpu;
|
|
int64_t memory;
|
|
int64_t peak_memory;
|
|
|
|
void clear() {
|
|
wall_time = cpu = memory = peak_memory = 0;
|
|
}
|
|
void set(int64_t w, int64_t c, int64_t m, int64_t p) {
|
|
wall_time = w;
|
|
cpu = c;
|
|
memory = m;
|
|
peak_memory = p;
|
|
}
|
|
};
|
|
|
|
struct TraceEntry : TraceData {
|
|
union {
|
|
int64_t index;
|
|
const char *ptr;
|
|
} symbol;
|
|
|
|
void rebase(const TraceData& base_vals) {
|
|
wall_time -= base_vals.wall_time;
|
|
cpu -= base_vals.cpu;
|
|
memory -= base_vals.memory;
|
|
peak_memory -= base_vals.peak_memory;
|
|
}
|
|
};
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// TraceProfiler
|
|
|
|
static char *xhprof_trace_header = "xhprof trace v0\n%d bytes\n";
|
|
static char *xhprof_trace_speed = "%d MHz\n";
|
|
|
|
class TraceProfiler : public Profiler {
|
|
public:
|
|
|
|
static TraceEntry *s_trace;
|
|
static int s_n_backing;
|
|
int nTrace;
|
|
bool full;
|
|
int64_t maxTraceBuffer;
|
|
int64_t overflowCalls;
|
|
|
|
bool trace_space_available() {
|
|
// the two slots are reserved for internal use
|
|
return nTrace < s_n_backing - 3;
|
|
}
|
|
|
|
bool trace_get_space() {
|
|
bool track_realloc = FALSE;
|
|
if (full) {
|
|
overflowCalls++;
|
|
return false;
|
|
}
|
|
int new_array_size;
|
|
if (s_n_backing == 0) {
|
|
new_array_size = RuntimeOption::ProfilerTraceBuffer;
|
|
} else {
|
|
new_array_size = s_n_backing * RuntimeOption::ProfilerTraceExpansion;
|
|
if (maxTraceBuffer != 0 && new_array_size > maxTraceBuffer) {
|
|
new_array_size = maxTraceBuffer > s_n_backing ?
|
|
maxTraceBuffer : s_n_backing;
|
|
}
|
|
if (new_array_size - nTrace <= 5) {
|
|
// for this operation to succeed, we need room for the entry we're
|
|
// adding, two realloc entries, and two entries to mark the end of
|
|
// the trace.
|
|
full = true;
|
|
collectStats("(trace buffer terminated)", s_trace[nTrace++]);
|
|
return false;
|
|
}
|
|
track_realloc = TRUE;
|
|
}
|
|
{
|
|
DECLARE_THREAD_INFO
|
|
MemoryManager::MaskAlloc masker(info->m_mm);
|
|
TraceEntry *r = (TraceEntry*)realloc((void *)s_trace,
|
|
new_array_size * sizeof(TraceEntry));
|
|
|
|
if (!r) {
|
|
full = true;
|
|
if (s_trace) {
|
|
collectStats("(trace buffer terminated)", s_trace[nTrace++]);
|
|
}
|
|
return false;
|
|
}
|
|
s_n_backing = new_array_size;
|
|
s_trace = r;
|
|
}
|
|
if (track_realloc) {
|
|
collectStats("(trace buffer realloc)", s_trace[nTrace++]);
|
|
collectStats(NULL, s_trace[nTrace++]);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
static pthread_mutex_t s_in_use;
|
|
|
|
class CountMap : public TraceData {
|
|
public:
|
|
int64_t count;
|
|
CountMap() : count(0) { clear(); }
|
|
};
|
|
typedef hphp_hash_map<std::string, CountMap, string_hash> StatsMap;
|
|
StatsMap m_stats; // outcome
|
|
|
|
explicit TraceProfiler(int flags)
|
|
: Profiler(), nTrace(0), full(false), m_flags(flags) {
|
|
if (pthread_mutex_trylock(&s_in_use)) {
|
|
m_successful = false;
|
|
} else {
|
|
char buf[20];
|
|
sprintf(buf, "%d", RuntimeOption::ProfilerMaxTraceBuffer);
|
|
IniSetting::Bind("profiler.max_trace_buffer", buf,
|
|
ini_on_update_long, &maxTraceBuffer);
|
|
}
|
|
overflowCalls = 0;
|
|
nTrace = 0;
|
|
}
|
|
|
|
~TraceProfiler() {
|
|
if (m_successful) {
|
|
free(s_trace);
|
|
nTrace = 0;
|
|
s_trace = NULL;
|
|
s_n_backing = 0;
|
|
pthread_mutex_unlock(&s_in_use);
|
|
IniSetting::Unbind("profiler.max_trace_buffer");
|
|
}
|
|
}
|
|
|
|
virtual void beginFrame(const char *symbol) {
|
|
doTrace(symbol);
|
|
}
|
|
|
|
virtual void endFrame(bool endMain = false) {
|
|
doTrace(NULL);
|
|
}
|
|
|
|
virtual void endAllFrames() {
|
|
Profiler::endAllFrames();
|
|
if (s_trace && nTrace < s_n_backing - 1) {
|
|
collectStats(NULL, s_trace[nTrace++]);
|
|
full = true;
|
|
}
|
|
}
|
|
|
|
void collectStats(const char *symbol, TraceEntry& te) {
|
|
te.symbol.ptr = symbol;
|
|
collectStats((TraceData&)te);
|
|
}
|
|
|
|
void collectStats(TraceData& te) {
|
|
te.wall_time = tsc();
|
|
te.cpu = 0;
|
|
if (m_flags & TrackCPU) {
|
|
te.cpu = vtsc(m_MHz);
|
|
}
|
|
if (m_flags & TrackMemory) {
|
|
MemoryManager *mm = MemoryManager::TheMemoryManager();
|
|
const MemoryUsageStats &stats = mm->getStats(true);
|
|
te.memory = stats.usage;
|
|
te.peak_memory = stats.peakUsage;
|
|
} else if (m_flags & TrackMalloc) {
|
|
te.memory = get_allocs();
|
|
te.peak_memory = get_frees();
|
|
} else {
|
|
te.memory = 0;
|
|
te.peak_memory = 0;
|
|
}
|
|
}
|
|
|
|
TraceEntry* next_trace() {
|
|
if (!trace_space_available() && !trace_get_space()) {
|
|
return 0;
|
|
}
|
|
return &s_trace[nTrace++];
|
|
}
|
|
|
|
void doTrace(const char *symbol) {
|
|
TraceEntry *te = next_trace();
|
|
if (te != NULL) {
|
|
collectStats(symbol, *te);
|
|
}
|
|
}
|
|
|
|
virtual void writeStats(Array &ret) {
|
|
std::map<const char *, unsigned>fmap;
|
|
TraceData my_begin;
|
|
collectStats(my_begin);
|
|
walkTrace(s_trace, s_trace + nTrace, m_stats, fmap);
|
|
if (overflowCalls) {
|
|
m_stats["(trace buffer terminated)"].count += overflowCalls/2;
|
|
}
|
|
extractStats(ret, m_stats, m_flags, m_MHz);
|
|
if (m_flags & GetTrace) {
|
|
String traceData;
|
|
packTraceData(fmap, traceData, m_MHz);
|
|
ret.set(s_compressed_trace, traceData);
|
|
fprintf(stderr, "%d bytes\n", traceData.size());
|
|
}
|
|
CountMap trace_buffer;
|
|
trace_buffer.count = 0;
|
|
trace_buffer.peak_memory = trace_buffer.memory = nTrace * sizeof(*s_trace);
|
|
returnVals(ret, "(trace buffer alloc)", trace_buffer, m_flags, m_MHz);
|
|
if (m_flags & MeasureXhprofDisable) {
|
|
CountMap my_end;
|
|
collectStats((TraceData&)my_end);
|
|
my_end.count = 1;
|
|
my_end.cpu -= my_begin.cpu;
|
|
my_end.wall_time -= my_begin.wall_time;
|
|
my_end.memory -= my_begin.memory;
|
|
my_end.peak_memory -= my_begin.peak_memory;
|
|
returnVals(ret, "xhprof_post_processing()", my_end, m_flags, m_MHz);
|
|
}
|
|
}
|
|
|
|
void extendBuffer(z_stream &strm, char *&buff, int &size) {
|
|
int old_size = size;
|
|
size *= 1.1;
|
|
buff = (char *)realloc(buff, size);
|
|
strm.next_out = (Bytef*)(buff + old_size);
|
|
strm.avail_out = size - old_size;
|
|
}
|
|
|
|
void deflater(z_stream &strm, int mode, char *&buff, int &size) {
|
|
while (deflate(&strm, Z_FULL_FLUSH),
|
|
strm.avail_out == 0)
|
|
{
|
|
extendBuffer(strm, buff, size);
|
|
}
|
|
}
|
|
|
|
template<class fm>
|
|
void packTraceData(fm &fmap, String& traceData, int MHz) {
|
|
typename fm::iterator fmIt;
|
|
int symbol_len = 0;
|
|
|
|
char speed_buff[50];
|
|
sprintf(speed_buff, xhprof_trace_speed, MHz);
|
|
int speed_len = strlen(speed_buff);
|
|
|
|
// size the buffer for holding function names
|
|
for (fmIt = fmap.begin(); fmIt != fmap.end(); ++fmIt) {
|
|
// record the length of each name
|
|
fmIt->second = strlen(fmIt->first) + 1;
|
|
symbol_len += fmIt->second;
|
|
}
|
|
++symbol_len;
|
|
char namebuff[symbol_len];
|
|
char *cp = namebuff;
|
|
int index = 0;
|
|
|
|
// copy function names, and give each function an index number
|
|
for (fmIt = fmap.begin(); fmIt != fmap.end(); ++fmIt) {
|
|
int len = fmIt->second;
|
|
memcpy((void *)cp, (void *)fmIt->first, len);
|
|
cp += len;
|
|
// this index will go in the trace
|
|
fmIt->second = index++;
|
|
}
|
|
*cp = '\0'; // an extra null
|
|
// map null strings to -1
|
|
fmap[NULL] = (typename fm::mapped_type)-1;
|
|
|
|
TraceEntry *te = s_trace;
|
|
TraceEntry *end = s_trace + nTrace;
|
|
|
|
while (te != end) {
|
|
te->symbol.index = fmap[te->symbol.ptr];
|
|
te->rebase(*s_trace);
|
|
++te;
|
|
}
|
|
|
|
// compress, starting with the list of functions
|
|
z_stream strm;
|
|
int ret;
|
|
|
|
strm.zalloc = Z_NULL;
|
|
strm.zfree = Z_NULL;
|
|
strm.opaque = Z_NULL;
|
|
ret = deflateInit(&strm, 3);
|
|
if (ret != Z_OK) {
|
|
return;
|
|
}
|
|
|
|
int trace_size = nTrace * sizeof(TraceEntry);
|
|
|
|
int dp_size = symbol_len/3 + trace_size/5;
|
|
char *dp_buff = (char*) malloc(dp_size);
|
|
|
|
sprintf(dp_buff, xhprof_trace_header, speed_len + symbol_len + trace_size);
|
|
strm.next_out = (Bytef*)strchr(dp_buff, 0);
|
|
strm.avail_out = dp_size - ((char *)strm.next_out - dp_buff);
|
|
|
|
strm.next_in = (Bytef*)speed_buff;
|
|
strm.avail_in = speed_len;
|
|
deflater(strm, Z_NO_FLUSH, dp_buff, dp_size);
|
|
|
|
strm.next_in = (Bytef*)namebuff;
|
|
strm.avail_in = symbol_len;
|
|
deflater(strm, Z_FULL_FLUSH, dp_buff, dp_size);
|
|
|
|
strm.next_in = (Bytef*)s_trace;
|
|
strm.avail_in = nTrace * sizeof(TraceEntry);
|
|
deflater(strm, Z_NO_FLUSH, dp_buff, dp_size);
|
|
|
|
while ((ret = deflate(&strm, Z_FINISH)) != Z_STREAM_END)
|
|
{
|
|
extendBuffer(strm, dp_buff, dp_size);
|
|
}
|
|
traceData = String(dp_buff, dp_size - strm.avail_out, AttachString);
|
|
nTrace = 0;
|
|
}
|
|
|
|
static String
|
|
unpackTraceData(CStrRef packedTrace) {
|
|
const char *input = packedTrace.c_str();
|
|
int64_t input_length = packedTrace.size();
|
|
|
|
int output_length;
|
|
char *output = 0;
|
|
String s;
|
|
|
|
if (sscanf(input, xhprof_trace_header, &output_length) != 1) {
|
|
return empty_string;
|
|
}
|
|
const char *zipped_begin;
|
|
if (!(zipped_begin = strchr(input, '\n'))
|
|
|| !(zipped_begin = strchr(zipped_begin + 1, '\n'))) {
|
|
goto error2_out;
|
|
}
|
|
++zipped_begin;
|
|
|
|
int ret;
|
|
z_stream strm;
|
|
|
|
strm.zalloc = Z_NULL;
|
|
strm.zfree = Z_NULL;
|
|
strm.opaque = Z_NULL;
|
|
strm.avail_in = 0;
|
|
strm.next_in = Z_NULL;
|
|
ret = inflateInit(&strm);
|
|
if (ret != Z_OK) {
|
|
goto error2_out;
|
|
}
|
|
|
|
s = String(output_length, ReserveString);
|
|
output = s.mutableSlice().ptr;
|
|
|
|
strm.avail_in = input_length - (zipped_begin - input);
|
|
strm.next_in = (Bytef*)zipped_begin;
|
|
strm.avail_out = output_length;
|
|
strm.next_out = (Bytef*)output;
|
|
switch (inflate(&strm, Z_NO_FLUSH)) {
|
|
case Z_NEED_DICT:
|
|
case Z_DATA_ERROR:
|
|
case Z_MEM_ERROR:
|
|
goto error_out;
|
|
}
|
|
if (strm.avail_out) {
|
|
goto error_out;
|
|
}
|
|
inflateEnd(&strm);
|
|
|
|
return s.setSize(output_length);
|
|
error_out:
|
|
free(output);
|
|
inflateEnd(&strm);
|
|
error2_out:
|
|
return empty_string;
|
|
}
|
|
|
|
|
|
private:
|
|
uint32_t m_flags;
|
|
};
|
|
|
|
TraceEntry *TraceProfiler::s_trace = NULL;
|
|
int TraceProfiler::s_n_backing = 0;
|
|
pthread_mutex_t TraceProfiler::s_in_use = PTHREAD_MUTEX_INITIALIZER;
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// SampleProfiler
|
|
|
|
/**
|
|
* Sampling based profiler.
|
|
*/
|
|
class SampleProfiler : public Profiler {
|
|
private:
|
|
typedef hphp_hash_map<std::string, int64_t, string_hash> CountMap;
|
|
typedef hphp_hash_map<std::string, CountMap, string_hash> StatsMap;
|
|
StatsMap m_stats; // outcome
|
|
|
|
public:
|
|
SampleProfiler() {
|
|
struct timeval now;
|
|
uint64_t truncated_us;
|
|
uint64_t truncated_tsc;
|
|
|
|
// Init the last_sample in tsc
|
|
m_last_sample_tsc = tsc();
|
|
|
|
// Find the microseconds that need to be truncated
|
|
gettimeofday(&m_last_sample_time, 0);
|
|
now = m_last_sample_time;
|
|
hp_trunc_time(&m_last_sample_time, SAMPLING_INTERVAL);
|
|
|
|
// Subtract truncated time from last_sample_tsc
|
|
truncated_us = get_us_interval(&m_last_sample_time, &now);
|
|
truncated_tsc = truncated_us * m_MHz;
|
|
if (m_last_sample_tsc > truncated_tsc) {
|
|
// just to be safe while subtracting unsigned ints
|
|
m_last_sample_tsc -= truncated_tsc;
|
|
}
|
|
|
|
// Convert sampling interval to ticks
|
|
m_sampling_interval_tsc = SAMPLING_INTERVAL * m_MHz;
|
|
}
|
|
|
|
virtual void beginFrameEx() {
|
|
sample_check();
|
|
}
|
|
|
|
virtual void endFrameEx() {
|
|
sample_check();
|
|
}
|
|
|
|
virtual void writeStats(Array &ret) {
|
|
for (StatsMap::const_iterator iter = m_stats.begin();
|
|
iter != m_stats.end(); ++iter) {
|
|
Array arr;
|
|
const CountMap &counts = iter->second;
|
|
for (CountMap::const_iterator iterCount = counts.begin();
|
|
iterCount != counts.end(); ++iterCount) {
|
|
arr.set(String(iterCount->first), iterCount->second);
|
|
}
|
|
ret.set(String(iter->first), arr);
|
|
}
|
|
}
|
|
|
|
private:
|
|
static const int SAMPLING_INTERVAL = 100000; // microsecs
|
|
|
|
struct timeval m_last_sample_time;
|
|
uint64_t m_last_sample_tsc;
|
|
uint64_t m_sampling_interval_tsc;
|
|
|
|
/**
|
|
* Sample the stack. Add it to the stats_count global.
|
|
*
|
|
* @param tv current time
|
|
* @param entries func stack as linked list of hprof_entry_t
|
|
* @return void
|
|
* @author veeve
|
|
*/
|
|
void sample_stack() {
|
|
char key[512];
|
|
snprintf(key, sizeof(key), "%ld.%06ld",
|
|
m_last_sample_time.tv_sec, m_last_sample_time.tv_usec);
|
|
|
|
char symbol[5120];
|
|
m_stack->getStack(INT_MAX, symbol, sizeof(symbol));
|
|
m_stats[key][symbol] = 1;
|
|
}
|
|
|
|
/**
|
|
* Checks to see if it is time to sample the stack.
|
|
* Calls hp_sample_stack() if its time.
|
|
*
|
|
* @param entries func stack as linked list of hprof_entry_t
|
|
* @param last_sample time the last sample was taken
|
|
* @param sampling_intr sampling interval in microsecs
|
|
* @return void
|
|
* @author veeve
|
|
*/
|
|
void sample_check() {
|
|
if (m_stack) {
|
|
// While loop is to handle a single function taking a long time
|
|
// and passing several sampling intervals
|
|
while ((tsc() - m_last_sample_tsc) > m_sampling_interval_tsc) {
|
|
m_last_sample_tsc += m_sampling_interval_tsc;
|
|
// HAS TO BE UPDATED BEFORE calling sample_stack
|
|
incr_us_interval(&m_last_sample_time, SAMPLING_INTERVAL);
|
|
sample_stack();
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
class ProfilerFactory : public RequestEventHandler {
|
|
public:
|
|
enum Level {
|
|
Simple = 1,
|
|
Hierarchical = 2,
|
|
Memory = 3,
|
|
Trace = 4,
|
|
Sample = 620002, // Rockfort's zip code
|
|
};
|
|
|
|
static bool EnableNetworkProfiler;
|
|
|
|
public:
|
|
ProfilerFactory() : m_profiler(NULL) {
|
|
}
|
|
|
|
~ProfilerFactory() {
|
|
stop();
|
|
}
|
|
|
|
Profiler *getProfiler() {
|
|
return m_profiler;
|
|
}
|
|
|
|
virtual void requestInit() {
|
|
}
|
|
|
|
virtual void requestShutdown() {
|
|
stop();
|
|
m_artificialFrameNames.reset();
|
|
}
|
|
|
|
void start(Level level, long flags) {
|
|
if (!RuntimeOption::EnableHotProfiler) {
|
|
return;
|
|
}
|
|
HPHP::EventHook::Enable();
|
|
if (m_profiler == NULL) {
|
|
switch (level) {
|
|
case Simple:
|
|
m_profiler = new SimpleProfiler();
|
|
break;
|
|
case Hierarchical:
|
|
m_profiler = new HierarchicalProfiler(flags);
|
|
break;
|
|
case Sample:
|
|
m_profiler = new SampleProfiler();
|
|
break;
|
|
case Trace:
|
|
m_profiler = new TraceProfiler(flags);
|
|
break;
|
|
default:
|
|
throw_invalid_argument("level: %d", level);
|
|
return;
|
|
}
|
|
if (m_profiler->m_successful) {
|
|
m_profiler->beginFrame("main()");
|
|
ThreadInfo::s_threadInfo->m_profiler = m_profiler;
|
|
} else {
|
|
delete m_profiler;
|
|
m_profiler = NULL;
|
|
}
|
|
}
|
|
}
|
|
|
|
Variant stop() {
|
|
if (m_profiler) {
|
|
m_profiler->endAllFrames();
|
|
|
|
Array ret;
|
|
m_profiler->writeStats(ret);
|
|
delete m_profiler;
|
|
m_profiler = NULL;
|
|
ThreadInfo::s_threadInfo->m_profiler = NULL;
|
|
|
|
return ret;
|
|
}
|
|
return uninit_null();
|
|
}
|
|
|
|
/**
|
|
* The whole purpose to make sure "const char *" is safe to take on these
|
|
* strings.
|
|
*/
|
|
void cacheString(CStrRef name) {
|
|
m_artificialFrameNames.append(name);
|
|
}
|
|
|
|
private:
|
|
Profiler *m_profiler;
|
|
Array m_artificialFrameNames;
|
|
};
|
|
|
|
bool ProfilerFactory::EnableNetworkProfiler = false;
|
|
|
|
#ifdef HOTPROFILER
|
|
IMPLEMENT_STATIC_REQUEST_LOCAL(ProfilerFactory, s_factory);
|
|
#endif
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// main functions
|
|
|
|
void f_hotprofiler_enable(int level) {
|
|
#ifdef HOTPROFILER
|
|
long flags = 0;
|
|
if (level == ProfilerFactory::Hierarchical) {
|
|
flags = TrackBuiltins;
|
|
} else if (level == ProfilerFactory::Memory) {
|
|
level = ProfilerFactory::Hierarchical;
|
|
flags = TrackBuiltins | TrackMemory;
|
|
}
|
|
s_factory->start((ProfilerFactory::Level)level, flags);
|
|
#endif
|
|
}
|
|
|
|
Variant f_hotprofiler_disable() {
|
|
#ifdef HOTPROFILER
|
|
return s_factory->stop();
|
|
#else
|
|
return uninit_null();
|
|
#endif
|
|
}
|
|
|
|
void f_phprof_enable(int flags /* = 0 */) {
|
|
#ifdef HOTPROFILER
|
|
s_factory->start(ProfilerFactory::Hierarchical, flags);
|
|
#endif
|
|
}
|
|
|
|
Variant f_phprof_disable() {
|
|
#ifdef HOTPROFILER
|
|
return s_factory->stop();
|
|
#else
|
|
return uninit_null();
|
|
#endif
|
|
}
|
|
|
|
void f_fb_setprofile(CVarRef callback) {
|
|
#ifdef HOTPROFILER
|
|
if (ThreadInfo::s_threadInfo->m_profiler != NULL) {
|
|
// phpprof is enabled, don't let PHP code override it
|
|
return;
|
|
}
|
|
#endif
|
|
g_vmContext->m_setprofileCallback = callback;
|
|
if (callback.isNull()) {
|
|
HPHP::EventHook::Disable();
|
|
} else {
|
|
HPHP::EventHook::Enable();
|
|
}
|
|
}
|
|
|
|
void f_xhprof_frame_begin(CStrRef name) {
|
|
#ifdef HOTPROFILER
|
|
Profiler *prof = ThreadInfo::s_threadInfo->m_profiler;
|
|
if (prof) {
|
|
s_factory->cacheString(name);
|
|
prof->beginFrame(name.data());
|
|
}
|
|
#endif
|
|
}
|
|
|
|
void f_xhprof_frame_end() {
|
|
#ifdef HOTPROFILER
|
|
Profiler *prof = ThreadInfo::s_threadInfo->m_profiler;
|
|
if (prof) {
|
|
prof->endFrame();
|
|
}
|
|
#endif
|
|
}
|
|
|
|
void f_xhprof_enable(int flags/* = 0 */,
|
|
CArrRef args /* = null_array */) {
|
|
#ifdef HOTPROFILER
|
|
if (vtsc_syscall.num <= 0 &&
|
|
Util::Vdso::ClockGetTimeNS(CLOCK_THREAD_CPUTIME_ID) == -1) {
|
|
flags &= ~TrackVtsc;
|
|
}
|
|
if (flags & TrackVtsc) {
|
|
flags |= TrackCPU;
|
|
}
|
|
if (flags & XhpTrace) {
|
|
s_factory->start(ProfilerFactory::Trace, flags);
|
|
} else {
|
|
s_factory->start(ProfilerFactory::Hierarchical, flags);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
Variant f_xhprof_disable() {
|
|
#ifdef HOTPROFILER
|
|
return s_factory->stop();
|
|
#else
|
|
return uninit_null();
|
|
#endif
|
|
}
|
|
|
|
void f_xhprof_network_enable() {
|
|
ServerStats::StartNetworkProfile();
|
|
}
|
|
|
|
Variant f_xhprof_network_disable() {
|
|
return ServerStats::EndNetworkProfile();
|
|
}
|
|
|
|
void f_xhprof_sample_enable() {
|
|
#ifdef HOTPROFILER
|
|
s_factory->start(ProfilerFactory::Sample, 0);
|
|
#endif
|
|
}
|
|
|
|
Variant f_xhprof_sample_disable() {
|
|
#ifdef HOTPROFILER
|
|
return s_factory->stop();
|
|
#else
|
|
return uninit_null();
|
|
#endif
|
|
}
|
|
|
|
Variant f_xhprof_run_trace(CStrRef packedTrace, int flags) {
|
|
#ifdef HOTPROFILER
|
|
|
|
String traceData = TraceProfiler::unpackTraceData(packedTrace);
|
|
|
|
// we really don't want to copy this
|
|
char *syms = const_cast<char*>(traceData.c_str());
|
|
|
|
int MHz;
|
|
if (sscanf(syms, xhprof_trace_speed, &MHz) != 1) {
|
|
return uninit_null();
|
|
}
|
|
|
|
vector<char *>symbols;
|
|
char *sym = strchr(syms, '\n') + 1;
|
|
char *esym = strchr(sym, '\0');
|
|
while (sym != esym) {
|
|
symbols.push_back(sym);
|
|
sym = esym + 1;
|
|
esym = strchr(sym, '\0');
|
|
}
|
|
|
|
TraceEntry *begin = (TraceEntry*)(esym + 1);
|
|
TraceEntry *end = (TraceEntry*)(syms + traceData.size());
|
|
|
|
for (TraceEntry *toup = begin; toup != end; ++toup) {
|
|
int symIndex = toup->symbol.index;
|
|
if (symIndex >= 0 && (uint64_t)symIndex < symbols.size()) {
|
|
toup->symbol.ptr = symbols[toup->symbol.index];
|
|
} else if (symIndex == -1) {
|
|
toup->symbol.ptr = NULL;
|
|
} else {
|
|
// corrupt trace
|
|
return uninit_null();
|
|
}
|
|
}
|
|
|
|
std::map<const char *, unsigned>fmap;
|
|
Array result;
|
|
TraceProfiler::StatsMap stats;
|
|
walkTrace(&*begin, &*end, stats, fmap);
|
|
Profiler::extractStats(result, stats, flags, MHz);
|
|
return result;
|
|
#else
|
|
return uninit_null();
|
|
#endif
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// constants
|
|
const int64_t k_XHPROF_FLAGS_NO_BUILTINS = TrackBuiltins;
|
|
const int64_t k_XHPROF_FLAGS_CPU = TrackCPU;
|
|
const int64_t k_XHPROF_FLAGS_MEMORY = TrackMemory;
|
|
const int64_t k_XHPROF_FLAGS_VTSC = TrackVtsc;
|
|
const int64_t k_XHPROF_FLAGS_TRACE = XhpTrace;
|
|
const int64_t k_XHPROF_FLAGS_MEASURE_XHPROF_DISABLE = MeasureXhprofDisable;
|
|
const int64_t k_XHPROF_FLAGS_GET_TRACE = GetTrace;
|
|
const int64_t k_XHPROF_FLAGS_MALLOC = TrackMalloc;
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// injected code
|
|
|
|
void begin_profiler_frame(Profiler *p, const char *symbol) {
|
|
p->beginFrame(symbol);
|
|
}
|
|
|
|
void end_profiler_frame(Profiler *p) {
|
|
p->endFrame();
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
}
|