/* +----------------------------------------------------------------------+ | HipHop for PHP | +----------------------------------------------------------------------+ | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) | | Copyright (c) 1997-2010 The PHP Group | +----------------------------------------------------------------------+ | This source file is subject to version 3.01 of the PHP license, | | that is bundled with this package in the file LICENSE, and is | | available through the world-wide-web at the following url: | | http://www.php.net/license/3_01.txt | | If you did not receive a copy of the PHP license and are unable to | | obtain it through the world-wide-web, please send a note to | | license@php.net so we can mail you a copy immediately. | +----------------------------------------------------------------------+ */ #include "hphp/runtime/ext/ext_fb.h" #include "hphp/runtime/base/memory_manager.h" #include "hphp/runtime/base/request_local.h" #include "hphp/runtime/base/zend_math.h" #include "hphp/runtime/server/server_stats.h" #include "hphp/runtime/base/ini_setting.h" #include "hphp/runtime/vm/event_hook.h" #include "hphp/util/alloc.h" #include "hphp/util/vdso.h" #include "hphp/util/cycles.h" #ifdef __FreeBSD__ # include # include # define cpu_set_t cpuset_t # define SET_AFFINITY(pid, size, mask) \ cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, size, mask) # define GET_AFFINITY(pid, size, mask) \ cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, size, mask) #elif __APPLE__ # include # include # include # define cpu_set_t thread_affinity_policy_data_t # define CPU_SET(cpu_id, new_mask) \ (*(new_mask)).affinity_tag = (cpu_id + 1) # define CPU_ZERO(new_mask) \ (*(new_mask)).affinity_tag = THREAD_AFFINITY_TAG_NULL # define GET_AFFINITY(pid, size, mask) \ (*(mask)).affinity_tag = THREAD_AFFINITY_TAG_NULL # define SET_AFFINITY(pid, size, mask) \ thread_policy_set(mach_thread_self(), THREAD_AFFINITY_POLICY, \ (int *)mask, THREAD_AFFINITY_POLICY_COUNT) #else # include # define SET_AFFINITY(pid, size, mask) sched_setaffinity(0, size, mask) # define GET_AFFINITY(pid, size, mask) sched_getaffinity(0, size, mask) #endif #include #include #include // Append the delimiter #define HP_STACK_DELIM "==>" #define HP_STACK_DELIM_LEN (sizeof(HP_STACK_DELIM) - 1) namespace HPHP { IMPLEMENT_DEFAULT_EXTENSION(hotprofiler); IMPLEMENT_DEFAULT_EXTENSION(xhprof); /////////////////////////////////////////////////////////////////////////////// // helpers /* * A hash function to calculate a 8-bit hash code for a function name. * This is based on a small modification to 'zend_inline_hash_func' by summing * up all bytes of the ulong returned by 'zend_inline_hash_func'. * * @param str, char *, string to be calculated hash code for. * * @author cjiang */ static inline uint8_t hprof_inline_hash(const char * str) { unsigned long h = 5381; uint i = 0; uint8_t res = 0; while (*str) { h += (h << 5); h ^= (unsigned long) *str++; } for (i = 0; i < sizeof(unsigned long); i++) { res += ((uint8_t *)&h)[i]; } return res; } /** * Get time delta in microseconds. */ static long get_us_interval(struct timeval *start, struct timeval *end) { return (((end->tv_sec - start->tv_sec) * 1000000) + (end->tv_usec - start->tv_usec)); } /** * Incr time with the given microseconds. */ static void incr_us_interval(struct timeval *start, uint64_t incr) { incr += (start->tv_sec * 1000000 + start->tv_usec); start->tv_sec = incr/1000000; start->tv_usec = incr%1000000; return; } /** * Truncates the given timeval to the nearest slot begin, where * the slot size is determined by intr * * @param tv Input timeval to be truncated in place * @param intr Time interval in microsecs - slot width * @return void * @author veeve */ static void hp_trunc_time(struct timeval *tv, uint64_t intr) { uint64_t time_in_micro; // Convert to microsecs and trunc that first time_in_micro = (tv->tv_sec * 1000000) + tv->tv_usec; time_in_micro /= intr; time_in_micro *= intr; // Update tv tv->tv_sec = (time_in_micro / 1000000); tv->tv_usec = (time_in_micro % 1000000); } /////////////////////////////////////////////////////////////////////////////// // High precision timer related functions. /** * This is a microbenchmark to get cpu frequency the process is running on. The * returned value is used to convert TSC counter values to microseconds. * * @return int64. * @author cjiang */ static int64_t get_cpu_frequency() { struct timeval start; struct timeval end; if (gettimeofday(&start, 0)) { perror("gettimeofday"); return 0.0; } uint64_t tsc_start = cpuCycles(); // Sleep for 5 miliseconds. Comparaing with gettimeofday's few microseconds // execution time, this should be enough. usleep(5000); if (gettimeofday(&end, 0)) { perror("gettimeofday"); return 0.0; } uint64_t tsc_end = cpuCycles(); return nearbyint((tsc_end - tsc_start) * 1.0 / (get_us_interval(&start, &end))); } #define MAX_LINELENGTH 1024 static int64_t* get_cpu_frequency_from_file(const char *file, int ncpus) { std::ifstream cpuinfo(file); if (cpuinfo.fail()) { return NULL; } char line[MAX_LINELENGTH]; int64_t* freqs = new int64_t[ncpus]; for (int i = 0; i < ncpus; ++i) { freqs[i] = 0; } int processor = -1; while (cpuinfo.getline(line, sizeof(line))) { if (sscanf(line, "processor : %d", &processor) == 1) { continue; } float freq; if (sscanf(line, "cpu MHz : %f", &freq) == 1) { if (processor != -1 && processor < ncpus) { freqs[processor] = nearbyint(freq); processor = -1; } } } for (int i = 0; i < ncpus; ++i) { if (freqs[i] == 0) { delete[] freqs; return NULL; } } return freqs; } class esyscall { public: int num; explicit esyscall(const char *syscall_name) { num = -1; char format[strlen(syscall_name) + sizeof(" %d")]; sprintf(format, "%s %%d", syscall_name); std::ifstream syscalls("/proc/esyscall"); if (syscalls.fail()) { return; } char line[MAX_LINELENGTH]; if (!syscalls.getline(line, sizeof(line))) { return; } // perhaps we should check the format, but we're just going to assume // Name Number while (syscalls.getline(line, sizeof(line))) { int number; if (sscanf(line, format, &number) == 1) { num = number; return; } } } }; /////////////////////////////////////////////////////////////////////////////// // Machine information that we collect just once. class MachineInfo { public: /** * Bind the current process to a specified CPU. This function is to ensure * that the OS won't schedule the process to different processors, which * would make values read by rdtsc unreliable. * * @param uint32 cpu_id, the id of the logical cpu to be bound to. * * @author cjiang */ static void BindToCPU(uint32_t cpu_id) { cpu_set_t new_mask; CPU_ZERO(&new_mask); CPU_SET(cpu_id, &new_mask); SET_AFFINITY(0, sizeof(cpu_set_t), &new_mask); } public: // The number of logical CPUs this machine has. int m_cpu_num; // Store the cpu frequency. Get it from /proc/cpuinfo if we can. int64_t* m_cpu_frequencies; MachineInfo() { m_cpu_num = sysconf(_SC_NPROCESSORS_CONF); m_cpu_frequencies = get_cpu_frequency_from_file("/proc/cpuinfo", m_cpu_num); if (m_cpu_frequencies) return; m_cpu_frequencies = new int64_t[m_cpu_num]; for (int i = 0; i < m_cpu_num; i++) { cpu_set_t prev_mask; GET_AFFINITY(0, sizeof(cpu_set_t), &prev_mask); BindToCPU(i); // Make sure the current process gets scheduled to the target cpu. This // might not be necessary though. usleep(0); m_cpu_frequencies[i] = get_cpu_frequency(); SET_AFFINITY(0, sizeof(cpu_set_t), &prev_mask); } } ~MachineInfo() { delete[] m_cpu_frequencies; } }; static MachineInfo s_machine; static inline uint64_t tv_to_cycles(const struct timeval& tv, int64_t MHz) { return (((uint64_t)tv.tv_sec * 1000000) + tv.tv_usec) * MHz; } static inline uint64_t to_usec(int64_t cycles, int64_t MHz, bool cpu_time = false) { #ifdef CLOCK_THREAD_CPUTIME_ID static int64_t vdso_usable = Util::Vdso::ClockGetTimeNS(CLOCK_THREAD_CPUTIME_ID); #else static int64_t vdso_usable = -1; #endif if (cpu_time && vdso_usable >= 0) return cycles / 1000; return (cycles + MHz/2) / MHz; } static esyscall vtsc_syscall("vtsc"); static inline uint64_t vtsc(int64_t MHz) { #ifdef CLOCK_THREAD_CPUTIME_ID int64_t rval = Util::Vdso::ClockGetTimeNS(CLOCK_THREAD_CPUTIME_ID); if (rval >= 0) { return rval; } #endif if (vtsc_syscall.num > 0) { return syscall(vtsc_syscall.num); } struct rusage usage; getrusage(RUSAGE_SELF, &usage); return tv_to_cycles(usage.ru_utime, MHz) + tv_to_cycles(usage.ru_stime, MHz); } #ifdef USE_JEMALLOC #define JEMALLOC_STAT_MIB_LEN 2 size_t mallctl_mib_len = JEMALLOC_STAT_MIB_LEN; size_t *mallctl_alloc_mib; size_t *mallctl_free_mib; bool mallctl_init = false; bool mallctl_mib_init() { if (mallctl_init) { return false; } mallctl_init = true; mallctl_alloc_mib = new size_t[mallctl_mib_len]; if (mallctlnametomib("thread.allocated", mallctl_alloc_mib, &mallctl_mib_len)) { return false; } mallctl_free_mib = new size_t[mallctl_mib_len]; if (mallctlnametomib("thread.deallocated", mallctl_free_mib, &mallctl_mib_len)) { return false; } return true; } #endif uint64_t get_allocs() { #ifdef USE_JEMALLOC if (mallctl) { if (!mallctl_alloc_mib) { if (!mallctl_mib_init()) { return 0; } } uint64_t stat; size_t size = sizeof(stat); if (mallctlbymib(mallctl_alloc_mib, mallctl_mib_len, &stat, &size, NULL, 0)) { return 0; } return stat; } #endif #ifdef USE_TCMALLOC if (MallocExtensionInstance) { size_t stat; MallocExtensionInstance()->GetNumericProperty( "generic.thread_bytes_allocated", &stat); return stat; } #endif return 0; } uint64_t get_frees() { #ifdef USE_JEMALLOC if (mallctl) { if (!mallctl_free_mib) { if (!mallctl_mib_init()) { return 0; } } uint64_t stat; size_t size = sizeof(stat); if (mallctlbymib(mallctl_free_mib, mallctl_mib_len, &stat, &size, NULL, 0)) { return 0; } return stat; } #endif #ifdef USE_TCMALLOC if (MallocExtensionInstance) { size_t stat; MallocExtensionInstance()->GetNumericProperty( "generic.thread_bytes_freed", &stat); return stat; } #endif return 0; } /////////////////////////////////////////////////////////////////////////////// // classes /** * All information we collect about a frame. */ class Frame { public: Frame *m_parent; // ptr to parent frame const char *m_name; // function name uint8_t m_hash_code; // hash_code for the function name int m_recursion; // recursion level for function uint64_t m_tsc_start; // start value for TSC counter int64_t m_mu_start; // memory usage int64_t m_pmu_start; // peak memory usage int64_t m_vtsc_start; // user/sys time start /** * Returns formatted function name * * @param result_buf ptr to result buf * @param result_len max size of result buf * @return total size of the function name returned in result_buf * @author veeve */ size_t getName(char *result_buf, size_t result_len) { if (result_len <= 1) { return 0; // Insufficient result_bug. Bail! } // Add '@recurse_level' if required // NOTE: Dont use snprintf's return val as it is compiler dependent if (m_recursion) { snprintf(result_buf, result_len, "%s@%d", m_name, m_recursion); } else { snprintf(result_buf, result_len, "%s", m_name); } // Force null-termination at MAX result_buf[result_len - 1] = 0; return strlen(result_buf); } /** * Build a caller qualified name for a callee. * * For example, if A() is caller for B(), then it returns "A==>B". * Recursive invokations are denoted with @ where n is the recursion * depth. * * For example, "foo==>foo@1", and "foo@2==>foo@3" are examples of direct * recursion. And "bar==>foo@1" is an example of an indirect recursive * call to foo (implying the foo() is on the call stack some levels * above). */ size_t getStack(int level, char *result_buf, size_t result_len) { // End recursion if we dont need deeper levels or // we dont have any deeper levels if (!m_parent || level <= 1) { return getName(result_buf, result_len); } // Take care of all ancestors first size_t len = m_parent->getStack(level - 1, result_buf, result_len); if (result_len < (len + HP_STACK_DELIM_LEN)) { return len; // Insufficient result_buf. Bail out! } // Add delimiter only if entry had ancestors if (len) { strncat(result_buf + len, HP_STACK_DELIM, result_len - len); len += HP_STACK_DELIM_LEN; } // Append the current function name return len + getName(result_buf + len, result_len - len); } }; enum Flag { TrackBuiltins = 0x1, TrackCPU = 0x2, TrackMemory = 0x4, TrackVtsc = 0x8, XhpTrace = 0x10, MeasureXhprofDisable = 0x20, GetTrace = 0x40, TrackMalloc = 0x80, }; const StaticString s_ct("ct"), s_wt("wt"), s_cpu("cpu"), s_mu("mu"), s_pmu("pmu"), s_alloc("alloc"), s_free("free"), s_compressed_trace("(compressed_trace)"); /** * Maintain profiles of a running stack. */ class Profiler { public: Profiler() : m_successful(true), m_stack(NULL), m_frame_free_list(NULL) { if (!s_rand_initialized) { s_rand_initialized = true; srand(math_generate_seed()); } // bind to a random cpu so that we can use rdtsc instruction. int cur_cpu_id = rand() % s_machine.m_cpu_num; GET_AFFINITY(0, sizeof(cpu_set_t), &m_prev_mask); MachineInfo::BindToCPU(cur_cpu_id); m_MHz = s_machine.m_cpu_frequencies[cur_cpu_id]; memset(m_func_hash_counters, 0, sizeof(m_func_hash_counters)); } virtual ~Profiler() { SET_AFFINITY(0, sizeof(cpu_set_t), &m_prev_mask); endAllFrames(); for (Frame *p = m_frame_free_list; p;) { Frame *cur = p; p = p->m_parent; free(cur); } } /** * Subclass can do extra work by overriding these two virtual functions. */ virtual void beginFrameEx() {} // called right before a function call virtual void endFrameEx() {} // called right after a function is finished /** * Final results. */ virtual void writeStats(Array &ret) {} /** * Start a new frame with the specified symbol. */ virtual void beginFrame(const char *symbol) __attribute__ ((noinline)) ; /** * End top of the stack. */ virtual void endFrame(bool endMain = false) __attribute__ ((noinline)) ; virtual void endAllFrames() { while (m_stack) { endFrame(true); } } template static void returnVals(phpret& ret, const Name& name, const Counts& counts, int flags, int64_t MHz) { ArrayInit arr(5); arr.set(s_ct, counts.count); arr.set(s_wt, to_usec(counts.wall_time, MHz)); if (flags & TrackCPU) { arr.set(s_cpu, to_usec(counts.cpu, MHz, true)); } if (flags & TrackMemory) { arr.set(s_mu, counts.memory); arr.set(s_pmu, counts.peak_memory); } else if (flags & TrackMalloc) { arr.set(s_alloc, counts.memory); arr.set(s_free, counts.peak_memory); } ret.set(String(name), arr.create()); } template static bool extractStats(phpret& ret, StatsMap& stats, int flags, int64_t MHz) { for (typename StatsMap::const_iterator iter = stats.begin(); iter != stats.end(); ++iter) { returnVals(ret, iter->first, iter->second, flags, MHz); } return true; } bool m_successful; int64_t m_MHz; // cpu freq for either the local cpu or the saved trace Frame *m_stack; // top of the profile stack static bool s_rand_initialized; cpu_set_t m_prev_mask; // saved cpu affinity Frame *m_frame_free_list; // freelist of Frame uint8_t m_func_hash_counters[256]; // counter table by hash code; /** * Fast allocate a Frame structure. Picks one from the * free list if available, else does an actual allocate. */ Frame *createFrame(const char *symbol) { Frame *p = m_frame_free_list; if (p) { m_frame_free_list = p->m_parent; } else { p = (Frame*)malloc(sizeof(Frame)); } p->m_parent = m_stack; p->m_name = symbol; p->m_hash_code = hprof_inline_hash(symbol); m_stack = p; return p; } /** * Fast free a Frame structure. Simply returns back the Frame to a free list * and doesn't actually perform the free. */ void releaseFrame() { assert(m_stack); Frame *p = m_stack; m_stack = p->m_parent; p->m_parent = m_frame_free_list; // we overload the m_parent field here m_frame_free_list = p; } }; bool Profiler::s_rand_initialized = false; void Profiler::beginFrame(const char *symbol) { Frame *current = createFrame(symbol); // NOTE(cjiang): use hash code to fend off most of call-stack traversal int recursion_level = 0; if (m_func_hash_counters[current->m_hash_code] > 0) { // Find this symbols recurse level for (Frame *p = current->m_parent; p; p = p->m_parent) { if (strcmp(current->m_name, p->m_name) == 0) { recursion_level = p->m_recursion + 1; break; } } } current->m_recursion = recursion_level; m_func_hash_counters[current->m_hash_code]++; beginFrameEx(); } /** * End top of the stack. */ void Profiler::endFrame(bool endMain) { if (m_stack) { // special case for main() frame that's only ended by endAllFrames() if (!endMain && m_stack->m_parent == NULL) { return; } endFrameEx(); m_func_hash_counters[m_stack->m_hash_code]--; releaseFrame(); } } /////////////////////////////////////////////////////////////////////////////// // SimpleProfiler /** * vtsc() based profiler, but simple enough to print basic information. * * When available, we now use the vtsc() call, which is relatively inexpensive * and accurate. It's still a system call, but a cheap one. If the call isn't * available, the comment below still applies. --renglish * * COMMENT(cjiang): getrusage is very expensive and inaccurate. It is based * on sampling at the rate about once every 5 to 10 miliseconds. The sampling * error can be very significantly, especially given that we are * instrumenting at a very fine granularity. (every PHP function call will * lead to one invokation of getrusage.) Most PHP functions such as the * built-ins typically finish in microseconds. Thus the result we get from * getrusage is very likely going to be skewed. Also worth noting that * getrusage actually is a system call, which involves expensive swapping * between user-mode and kernel mode. I would suggest we remove collecting * CPU usage all together, as exclusive wall-time is very useful already. * Or at least we should make it an opt-in choice. * * See: http://ww2.cs.fsu.edu/~hines/present/timing_linux.pdf * * Above is a nice paper talking about the overhead and the inaccuracy problem * associated with getrusage. */ class SimpleProfiler : public Profiler { private: class CountMap { public: CountMap() : count(0), tsc(0), vtsc(0) {} int64_t count; int64_t tsc; int64_t vtsc; }; typedef hphp_hash_map StatsMap; StatsMap m_stats; // outcome public: SimpleProfiler() { echo("
"); } ~SimpleProfiler() { echo("
"); print_output(); } virtual void beginFrameEx() { m_stack->m_tsc_start = cpuCycles(); m_stack->m_vtsc_start = vtsc(m_MHz); } virtual void endFrameEx() { CountMap &counts = m_stats[m_stack->m_name]; counts.count++; counts.tsc += cpuCycles() - m_stack->m_tsc_start; counts.vtsc += vtsc(m_MHz) - m_stack->m_vtsc_start; } private: void print_output() { echo("" "" "

Hotprofiler Data


" "
" "

 
"); } }; /////////////////////////////////////////////////////////////////////////////// // HierarchicalProfiler class HierarchicalProfiler : public Profiler { private: class CountMap { public: CountMap() : count(0), wall_time(0), cpu(0), memory(0), peak_memory(0) {} int64_t count; int64_t wall_time; int64_t cpu; int64_t memory; int64_t peak_memory; }; typedef hphp_hash_map StatsMap; StatsMap m_stats; // outcome public: public: explicit HierarchicalProfiler(int flags) : m_flags(flags) { } virtual void beginFrameEx() { m_stack->m_tsc_start = cpuCycles(); if (m_flags & TrackCPU) { m_stack->m_vtsc_start = vtsc(m_MHz); } if (m_flags & TrackMemory) { MemoryManager *mm = MemoryManager::TheMemoryManager(); const MemoryUsageStats &stats = mm->getStats(true); m_stack->m_mu_start = stats.usage; m_stack->m_pmu_start = stats.peakUsage; } else if (m_flags & TrackMalloc) { m_stack->m_mu_start = get_allocs(); m_stack->m_pmu_start = get_frees(); } } virtual void endFrameEx() { char symbol[512]; m_stack->getStack(2, symbol, sizeof(symbol)); CountMap &counts = m_stats[symbol]; counts.count++; counts.wall_time += cpuCycles() - m_stack->m_tsc_start; if (m_flags & TrackCPU) { counts.cpu += vtsc(m_MHz) - m_stack->m_vtsc_start; } if (m_flags & TrackMemory) { MemoryManager *mm = MemoryManager::TheMemoryManager(); const MemoryUsageStats &stats = mm->getStats(true); int64_t mu_end = stats.usage; int64_t pmu_end = stats.peakUsage; counts.memory += mu_end - m_stack->m_mu_start; counts.peak_memory += pmu_end - m_stack->m_pmu_start; } else if (m_flags & TrackMalloc) { counts.memory += get_allocs() - m_stack->m_mu_start; counts.peak_memory += get_frees() - m_stack->m_pmu_start; } } virtual void writeStats(Array &ret) { extractStats(ret, m_stats, m_flags, m_MHz); } private: uint32_t m_flags; }; template class walkTraceClass { public: struct Frame { TraceIt trace; int level; int len; }; typedef vector >Recursion; vector >recursion; vector stack; walkTraceClass() : arc_buff_len(200), arc_buff((char*)malloc(200)) {}; ~walkTraceClass() { free((void*)arc_buff); if (recursion.size() > 1) { Recursion::iterator r_it = recursion.begin(); while (++r_it != recursion.end()) { delete[] r_it->first; } } } int arc_buff_len; char *arc_buff; void checkArcBuff(int len) { len = 2*len + HP_STACK_DELIM_LEN + 2; if (len >= arc_buff_len) { arc_buff_len *= 2; arc_buff = (char *)realloc(arc_buff, arc_buff_len); if (arc_buff == NULL) { throw std::bad_alloc(); } } } void incStats(const char *arc, TraceIt tr, const Frame& fr, Stats& stats) { typename Stats::mapped_type& st = stats[arc]; ++st.count; st.wall_time += tr->wall_time - fr.trace->wall_time; st.cpu += tr->cpu - fr.trace->cpu; st.memory += tr->memory - fr.trace->memory; st.peak_memory += tr->peak_memory - fr.trace->peak_memory; } void popFrame(TraceIt tIt, std::vector& stack, Stats& stats) { Frame callee = stack.back(); stack.pop_back(); const char* arc; Frame& caller = stack.back(); char *cp = arc_buff; memcpy(cp, caller.trace->symbol.ptr, caller.len); cp += caller.len; if (caller.level >= 1) { std::pair& lvl = recursion[caller.level]; memcpy(cp, lvl.first, lvl.second); cp += lvl.second; } memcpy(cp, HP_STACK_DELIM, HP_STACK_DELIM_LEN); cp += HP_STACK_DELIM_LEN; memcpy(cp, callee.trace->symbol.ptr, callee.len); cp += callee.len; if (callee.level >= 1) { std::pair& lvl = recursion[callee.level]; memcpy(cp, lvl.first, lvl.second); cp += lvl.second; } *cp = 0; arc = arc_buff; incStats(arc, tIt, callee, stats); } void walk(TraceIt begin, TraceIt end, Stats& stats, std::map &functionLevel) { if (begin == end) { return; } recursion.push_back(std::make_pair((char *)NULL, 0)); while (begin != end && !begin->symbol.ptr) { ++begin; } while (begin != end) { if (begin->symbol.ptr) { unsigned level = ++functionLevel[begin->symbol.ptr]; if (level >= recursion.size()) { char *level_string = new char[8]; sprintf(level_string, "@%u", level); recursion.push_back(std::make_pair(level_string, strlen(level_string))); } Frame fr; fr.trace = begin; fr.level = level - 1; fr.len = strlen(begin->symbol.ptr); checkArcBuff(fr.len); stack.push_back(fr); } else if (stack.size() > 1) { --functionLevel[stack.back().trace->symbol.ptr]; popFrame(begin, stack, stats); } ++begin; } --begin; while (stack.size() > 1) { popFrame(begin, stack, stats); } if (!stack.empty()) { incStats(stack.back().trace->symbol.ptr, begin, stack.back(), stats); } } }; template static void walkTrace(TraceIt begin, TraceIt end, Stats& stats, Fmap &map) { walkTraceClasswalker; walker.walk(begin, end, stats, map); } struct TraceData { int64_t wall_time; int64_t cpu; int64_t memory; int64_t peak_memory; void clear() { wall_time = cpu = memory = peak_memory = 0; } void set(int64_t w, int64_t c, int64_t m, int64_t p) { wall_time = w; cpu = c; memory = m; peak_memory = p; } }; struct TraceEntry : TraceData { union { int64_t index; const char *ptr; } symbol; void rebase(const TraceData& base_vals) { wall_time -= base_vals.wall_time; cpu -= base_vals.cpu; memory -= base_vals.memory; peak_memory -= base_vals.peak_memory; } }; /////////////////////////////////////////////////////////////////////////////// // TraceProfiler static char *xhprof_trace_header = "xhprof trace v0\n%d bytes\n"; static char *xhprof_trace_speed = "%d MHz\n"; class TraceProfiler : public Profiler { public: static TraceEntry *s_trace; static int s_n_backing; int nTrace; bool full; int64_t maxTraceBuffer; int64_t overflowCalls; bool trace_space_available() { // the two slots are reserved for internal use return nTrace < s_n_backing - 3; } bool trace_get_space() { bool track_realloc = FALSE; if (full) { overflowCalls++; return false; } int new_array_size; if (s_n_backing == 0) { new_array_size = RuntimeOption::ProfilerTraceBuffer; } else { new_array_size = s_n_backing * RuntimeOption::ProfilerTraceExpansion; if (maxTraceBuffer != 0 && new_array_size > maxTraceBuffer) { new_array_size = maxTraceBuffer > s_n_backing ? maxTraceBuffer : s_n_backing; } if (new_array_size - nTrace <= 5) { // for this operation to succeed, we need room for the entry we're // adding, two realloc entries, and two entries to mark the end of // the trace. full = true; collectStats("(trace buffer terminated)", s_trace[nTrace++]); return false; } track_realloc = TRUE; } { DECLARE_THREAD_INFO MemoryManager::MaskAlloc masker(info->m_mm); TraceEntry *r = (TraceEntry*)realloc((void *)s_trace, new_array_size * sizeof(TraceEntry)); if (!r) { full = true; if (s_trace) { collectStats("(trace buffer terminated)", s_trace[nTrace++]); } return false; } s_n_backing = new_array_size; s_trace = r; } if (track_realloc) { collectStats("(trace buffer realloc)", s_trace[nTrace++]); collectStats(NULL, s_trace[nTrace++]); } return true; } static pthread_mutex_t s_in_use; class CountMap : public TraceData { public: int64_t count; CountMap() : count(0) { clear(); } }; typedef hphp_hash_map StatsMap; StatsMap m_stats; // outcome explicit TraceProfiler(int flags) : Profiler(), nTrace(0), full(false), m_flags(flags) { if (pthread_mutex_trylock(&s_in_use)) { m_successful = false; } else { char buf[20]; sprintf(buf, "%d", RuntimeOption::ProfilerMaxTraceBuffer); IniSetting::Bind("profiler.max_trace_buffer", buf, ini_on_update_long, &maxTraceBuffer); } overflowCalls = 0; nTrace = 0; } ~TraceProfiler() { if (m_successful) { free(s_trace); nTrace = 0; s_trace = NULL; s_n_backing = 0; pthread_mutex_unlock(&s_in_use); IniSetting::Unbind("profiler.max_trace_buffer"); } } virtual void beginFrame(const char *symbol) { doTrace(symbol); } virtual void endFrame(bool endMain = false) { doTrace(NULL); } virtual void endAllFrames() { Profiler::endAllFrames(); if (s_trace && nTrace < s_n_backing - 1) { collectStats(NULL, s_trace[nTrace++]); full = true; } } void collectStats(const char *symbol, TraceEntry& te) { te.symbol.ptr = symbol; collectStats((TraceData&)te); } void collectStats(TraceData& te) { te.wall_time = cpuCycles(); te.cpu = 0; if (m_flags & TrackCPU) { te.cpu = vtsc(m_MHz); } if (m_flags & TrackMemory) { MemoryManager *mm = MemoryManager::TheMemoryManager(); const MemoryUsageStats &stats = mm->getStats(true); te.memory = stats.usage; te.peak_memory = stats.peakUsage; } else if (m_flags & TrackMalloc) { te.memory = get_allocs(); te.peak_memory = get_frees(); } else { te.memory = 0; te.peak_memory = 0; } } TraceEntry* next_trace() { if (!trace_space_available() && !trace_get_space()) { return 0; } return &s_trace[nTrace++]; } void doTrace(const char *symbol) { TraceEntry *te = next_trace(); if (te != NULL) { collectStats(symbol, *te); } } virtual void writeStats(Array &ret) { std::mapfmap; TraceData my_begin; collectStats(my_begin); walkTrace(s_trace, s_trace + nTrace, m_stats, fmap); if (overflowCalls) { m_stats["(trace buffer terminated)"].count += overflowCalls/2; } extractStats(ret, m_stats, m_flags, m_MHz); if (m_flags & GetTrace) { String traceData; packTraceData(fmap, traceData, m_MHz); ret.set(s_compressed_trace, traceData); fprintf(stderr, "%d bytes\n", traceData.size()); } CountMap trace_buffer; trace_buffer.count = 0; trace_buffer.peak_memory = trace_buffer.memory = nTrace * sizeof(*s_trace); returnVals(ret, "(trace buffer alloc)", trace_buffer, m_flags, m_MHz); if (m_flags & MeasureXhprofDisable) { CountMap my_end; collectStats((TraceData&)my_end); my_end.count = 1; my_end.cpu -= my_begin.cpu; my_end.wall_time -= my_begin.wall_time; my_end.memory -= my_begin.memory; my_end.peak_memory -= my_begin.peak_memory; returnVals(ret, "xhprof_post_processing()", my_end, m_flags, m_MHz); } } void extendBuffer(z_stream &strm, char *&buff, int &size) { int old_size = size; size *= 1.1; buff = (char *)realloc(buff, size); strm.next_out = (Bytef*)(buff + old_size); strm.avail_out = size - old_size; } void deflater(z_stream &strm, int mode, char *&buff, int &size) { while (deflate(&strm, Z_FULL_FLUSH), strm.avail_out == 0) { extendBuffer(strm, buff, size); } } template void packTraceData(fm &fmap, String& traceData, int MHz) { typename fm::iterator fmIt; int symbol_len = 0; char speed_buff[50]; sprintf(speed_buff, xhprof_trace_speed, MHz); int speed_len = strlen(speed_buff); // size the buffer for holding function names for (fmIt = fmap.begin(); fmIt != fmap.end(); ++fmIt) { // record the length of each name fmIt->second = strlen(fmIt->first) + 1; symbol_len += fmIt->second; } ++symbol_len; char namebuff[symbol_len]; char *cp = namebuff; int index = 0; // copy function names, and give each function an index number for (fmIt = fmap.begin(); fmIt != fmap.end(); ++fmIt) { int len = fmIt->second; memcpy((void *)cp, (void *)fmIt->first, len); cp += len; // this index will go in the trace fmIt->second = index++; } *cp = '\0'; // an extra null // map null strings to -1 fmap[NULL] = (typename fm::mapped_type)-1; TraceEntry *te = s_trace; TraceEntry *end = s_trace + nTrace; while (te != end) { te->symbol.index = fmap[te->symbol.ptr]; te->rebase(*s_trace); ++te; } // compress, starting with the list of functions z_stream strm; int ret; strm.zalloc = Z_NULL; strm.zfree = Z_NULL; strm.opaque = Z_NULL; ret = deflateInit(&strm, 3); if (ret != Z_OK) { return; } int trace_size = nTrace * sizeof(TraceEntry); int dp_size = symbol_len/3 + trace_size/5; char *dp_buff = (char*) malloc(dp_size); sprintf(dp_buff, xhprof_trace_header, speed_len + symbol_len + trace_size); strm.next_out = (Bytef*)strchr(dp_buff, 0); strm.avail_out = dp_size - ((char *)strm.next_out - dp_buff); strm.next_in = (Bytef*)speed_buff; strm.avail_in = speed_len; deflater(strm, Z_NO_FLUSH, dp_buff, dp_size); strm.next_in = (Bytef*)namebuff; strm.avail_in = symbol_len; deflater(strm, Z_FULL_FLUSH, dp_buff, dp_size); strm.next_in = (Bytef*)s_trace; strm.avail_in = nTrace * sizeof(TraceEntry); deflater(strm, Z_NO_FLUSH, dp_buff, dp_size); while ((ret = deflate(&strm, Z_FINISH)) != Z_STREAM_END) { extendBuffer(strm, dp_buff, dp_size); } traceData = String(dp_buff, dp_size - strm.avail_out, AttachString); nTrace = 0; } static String unpackTraceData(CStrRef packedTrace) { const char *input = packedTrace.c_str(); int64_t input_length = packedTrace.size(); int output_length; char *output = 0; String s; if (sscanf(input, xhprof_trace_header, &output_length) != 1) { return empty_string; } const char *zipped_begin; if (!(zipped_begin = strchr(input, '\n')) || !(zipped_begin = strchr(zipped_begin + 1, '\n'))) { goto error2_out; } ++zipped_begin; int ret; z_stream strm; strm.zalloc = Z_NULL; strm.zfree = Z_NULL; strm.opaque = Z_NULL; strm.avail_in = 0; strm.next_in = Z_NULL; ret = inflateInit(&strm); if (ret != Z_OK) { goto error2_out; } s = String(output_length, ReserveString); output = s.mutableSlice().ptr; strm.avail_in = input_length - (zipped_begin - input); strm.next_in = (Bytef*)zipped_begin; strm.avail_out = output_length; strm.next_out = (Bytef*)output; switch (inflate(&strm, Z_NO_FLUSH)) { case Z_NEED_DICT: case Z_DATA_ERROR: case Z_MEM_ERROR: goto error_out; } if (strm.avail_out) { goto error_out; } inflateEnd(&strm); return s.setSize(output_length); error_out: free(output); inflateEnd(&strm); error2_out: return empty_string; } private: uint32_t m_flags; }; TraceEntry *TraceProfiler::s_trace = NULL; int TraceProfiler::s_n_backing = 0; pthread_mutex_t TraceProfiler::s_in_use = PTHREAD_MUTEX_INITIALIZER; /////////////////////////////////////////////////////////////////////////////// // SampleProfiler /** * Sampling based profiler. */ class SampleProfiler : public Profiler { private: typedef hphp_hash_map CountMap; typedef hphp_hash_map StatsMap; StatsMap m_stats; // outcome public: SampleProfiler() { struct timeval now; uint64_t truncated_us; uint64_t truncated_tsc; // Init the last_sample in tsc m_last_sample_tsc = cpuCycles(); // Find the microseconds that need to be truncated gettimeofday(&m_last_sample_time, 0); now = m_last_sample_time; hp_trunc_time(&m_last_sample_time, SAMPLING_INTERVAL); // Subtract truncated time from last_sample_tsc truncated_us = get_us_interval(&m_last_sample_time, &now); truncated_tsc = truncated_us * m_MHz; if (m_last_sample_tsc > truncated_tsc) { // just to be safe while subtracting unsigned ints m_last_sample_tsc -= truncated_tsc; } // Convert sampling interval to ticks m_sampling_interval_tsc = SAMPLING_INTERVAL * m_MHz; } virtual void beginFrameEx() { sample_check(); } virtual void endFrameEx() { sample_check(); } virtual void writeStats(Array &ret) { for (StatsMap::const_iterator iter = m_stats.begin(); iter != m_stats.end(); ++iter) { Array arr; const CountMap &counts = iter->second; for (CountMap::const_iterator iterCount = counts.begin(); iterCount != counts.end(); ++iterCount) { arr.set(String(iterCount->first), iterCount->second); } ret.set(String(iter->first), arr); } } private: static const int SAMPLING_INTERVAL = 100000; // microsecs struct timeval m_last_sample_time; uint64_t m_last_sample_tsc; uint64_t m_sampling_interval_tsc; /** * Sample the stack. Add it to the stats_count global. * * @param tv current time * @param entries func stack as linked list of hprof_entry_t * @return void * @author veeve */ void sample_stack() { char key[512]; snprintf(key, sizeof(key), "%" PRId64 ".%06" PRId64, (int64_t)m_last_sample_time.tv_sec, (int64_t)m_last_sample_time.tv_usec); char symbol[5120]; m_stack->getStack(INT_MAX, symbol, sizeof(symbol)); m_stats[key][symbol] = 1; } /** * Checks to see if it is time to sample the stack. * Calls hp_sample_stack() if its time. * * @param entries func stack as linked list of hprof_entry_t * @param last_sample time the last sample was taken * @param sampling_intr sampling interval in microsecs * @return void * @author veeve */ void sample_check() { if (m_stack) { // While loop is to handle a single function taking a long time // and passing several sampling intervals while ((cpuCycles() - m_last_sample_tsc) > m_sampling_interval_tsc) { m_last_sample_tsc += m_sampling_interval_tsc; // HAS TO BE UPDATED BEFORE calling sample_stack incr_us_interval(&m_last_sample_time, SAMPLING_INTERVAL); sample_stack(); } } } }; /////////////////////////////////////////////////////////////////////////////// class ProfilerFactory : public RequestEventHandler { public: enum Level { Simple = 1, Hierarchical = 2, Memory = 3, Trace = 4, Sample = 620002, // Rockfort's zip code }; static bool EnableNetworkProfiler; public: ProfilerFactory() : m_profiler(NULL) { } ~ProfilerFactory() { stop(); } Profiler *getProfiler() { return m_profiler; } virtual void requestInit() { } virtual void requestShutdown() { stop(); m_artificialFrameNames.reset(); } void start(Level level, long flags) { if (!RuntimeOption::EnableHotProfiler) { return; } HPHP::EventHook::Enable(); if (m_profiler == NULL) { switch (level) { case Simple: m_profiler = new SimpleProfiler(); break; case Hierarchical: m_profiler = new HierarchicalProfiler(flags); break; case Sample: m_profiler = new SampleProfiler(); break; case Trace: m_profiler = new TraceProfiler(flags); break; default: throw_invalid_argument("level: %d", level); return; } if (m_profiler->m_successful) { m_profiler->beginFrame("main()"); ThreadInfo::s_threadInfo->m_profiler = m_profiler; } else { delete m_profiler; m_profiler = NULL; } } } Variant stop() { if (m_profiler) { m_profiler->endAllFrames(); Array ret; m_profiler->writeStats(ret); delete m_profiler; m_profiler = NULL; ThreadInfo::s_threadInfo->m_profiler = NULL; return ret; } return uninit_null(); } /** * The whole purpose to make sure "const char *" is safe to take on these * strings. */ void cacheString(CStrRef name) { m_artificialFrameNames.append(name); } private: Profiler *m_profiler; Array m_artificialFrameNames; }; bool ProfilerFactory::EnableNetworkProfiler = false; #ifdef HOTPROFILER IMPLEMENT_STATIC_REQUEST_LOCAL(ProfilerFactory, s_factory); #endif /////////////////////////////////////////////////////////////////////////////// // main functions void f_hotprofiler_enable(int level) { #ifdef HOTPROFILER long flags = 0; if (level == ProfilerFactory::Hierarchical) { flags = TrackBuiltins; } else if (level == ProfilerFactory::Memory) { level = ProfilerFactory::Hierarchical; flags = TrackBuiltins | TrackMemory; } s_factory->start((ProfilerFactory::Level)level, flags); #endif } Variant f_hotprofiler_disable() { #ifdef HOTPROFILER return s_factory->stop(); #else return uninit_null(); #endif } void f_phprof_enable(int flags /* = 0 */) { #ifdef HOTPROFILER s_factory->start(ProfilerFactory::Hierarchical, flags); #endif } Variant f_phprof_disable() { #ifdef HOTPROFILER return s_factory->stop(); #else return uninit_null(); #endif } void f_fb_setprofile(CVarRef callback) { #ifdef HOTPROFILER if (ThreadInfo::s_threadInfo->m_profiler != NULL) { // phpprof is enabled, don't let PHP code override it return; } #endif g_vmContext->m_setprofileCallback = callback; if (callback.isNull()) { HPHP::EventHook::Disable(); } else { HPHP::EventHook::Enable(); } } void f_xhprof_frame_begin(CStrRef name) { #ifdef HOTPROFILER Profiler *prof = ThreadInfo::s_threadInfo->m_profiler; if (prof) { s_factory->cacheString(name); prof->beginFrame(name.data()); } #endif } void f_xhprof_frame_end() { #ifdef HOTPROFILER Profiler *prof = ThreadInfo::s_threadInfo->m_profiler; if (prof) { prof->endFrame(); } #endif } void f_xhprof_enable(int flags/* = 0 */, CArrRef args /* = null_array */) { #ifdef HOTPROFILER #ifdef CLOCK_THREAD_CPUTIME_ID bool missingClockGetTimeNS = Util::Vdso::ClockGetTimeNS(CLOCK_THREAD_CPUTIME_ID) == -1; #else bool missingClockGetTimeNS = true; #endif if (vtsc_syscall.num <= 0 && missingClockGetTimeNS) { flags &= ~TrackVtsc; } if (flags & TrackVtsc) { flags |= TrackCPU; } if (flags & XhpTrace) { s_factory->start(ProfilerFactory::Trace, flags); } else { s_factory->start(ProfilerFactory::Hierarchical, flags); } #endif } Variant f_xhprof_disable() { #ifdef HOTPROFILER return s_factory->stop(); #else return uninit_null(); #endif } void f_xhprof_network_enable() { ServerStats::StartNetworkProfile(); } Variant f_xhprof_network_disable() { return ServerStats::EndNetworkProfile(); } void f_xhprof_sample_enable() { #ifdef HOTPROFILER s_factory->start(ProfilerFactory::Sample, 0); #endif } Variant f_xhprof_sample_disable() { #ifdef HOTPROFILER return s_factory->stop(); #else return uninit_null(); #endif } Variant f_xhprof_run_trace(CStrRef packedTrace, int flags) { #ifdef HOTPROFILER String traceData = TraceProfiler::unpackTraceData(packedTrace); // we really don't want to copy this char *syms = const_cast(traceData.c_str()); int MHz; if (sscanf(syms, xhprof_trace_speed, &MHz) != 1) { return uninit_null(); } vectorsymbols; char *sym = strchr(syms, '\n') + 1; char *esym = strchr(sym, '\0'); while (sym != esym) { symbols.push_back(sym); sym = esym + 1; esym = strchr(sym, '\0'); } TraceEntry *begin = (TraceEntry*)(esym + 1); TraceEntry *end = (TraceEntry*)(syms + traceData.size()); for (TraceEntry *toup = begin; toup != end; ++toup) { int symIndex = toup->symbol.index; if (symIndex >= 0 && (uint64_t)symIndex < symbols.size()) { toup->symbol.ptr = symbols[toup->symbol.index]; } else if (symIndex == -1) { toup->symbol.ptr = NULL; } else { // corrupt trace return uninit_null(); } } std::mapfmap; Array result; TraceProfiler::StatsMap stats; walkTrace(&*begin, &*end, stats, fmap); Profiler::extractStats(result, stats, flags, MHz); return result; #else return uninit_null(); #endif } /////////////////////////////////////////////////////////////////////////////// // constants const int64_t k_XHPROF_FLAGS_NO_BUILTINS = TrackBuiltins; const int64_t k_XHPROF_FLAGS_CPU = TrackCPU; const int64_t k_XHPROF_FLAGS_MEMORY = TrackMemory; const int64_t k_XHPROF_FLAGS_VTSC = TrackVtsc; const int64_t k_XHPROF_FLAGS_TRACE = XhpTrace; const int64_t k_XHPROF_FLAGS_MEASURE_XHPROF_DISABLE = MeasureXhprofDisable; const int64_t k_XHPROF_FLAGS_GET_TRACE = GetTrace; const int64_t k_XHPROF_FLAGS_MALLOC = TrackMalloc; /////////////////////////////////////////////////////////////////////////////// // injected code void begin_profiler_frame(Profiler *p, const char *symbol) { p->beginFrame(symbol); } void end_profiler_frame(Profiler *p) { p->endFrame(); } /////////////////////////////////////////////////////////////////////////////// }