Introduce rdtsc, cycle delay primitives.

I keep reintroducing these by hand. My workflow is usually:

1. Have an idea to optimize X.
2. Realize I don't even know if X is important or not.
3. Instrument X to slow it down, to see if it makes a difference in
   perflab.

After this diff, step 3 is a simple matter of inserting a call to
cycleDelay(1000) (or what have you) in the path of interest.
Esse commit está contido em:
Keith Adams
2013-06-27 14:35:22 -07:00
commit de Sara Golemon
commit a0ae83d0fd
3 arquivos alterados com 67 adições e 30 exclusões
+11 -30
Ver Arquivo
@@ -24,6 +24,7 @@
#include "hphp/runtime/vm/event_hook.h"
#include "hphp/util/alloc.h"
#include "hphp/util/vdso.h"
#include "hphp/util/cycles.h"
#ifdef __FreeBSD__
# include <sys/resource.h>
@@ -64,6 +65,7 @@
#define HP_STACK_DELIM_LEN (sizeof(HP_STACK_DELIM) - 1)
namespace HPHP {
IMPLEMENT_DEFAULT_EXTENSION(hotprofiler);
IMPLEMENT_DEFAULT_EXTENSION(xhprof);
///////////////////////////////////////////////////////////////////////////////
@@ -137,27 +139,6 @@ static void hp_trunc_time(struct timeval *tv, uint64_t intr) {
///////////////////////////////////////////////////////////////////////////////
// High precision timer related functions.
/**
 * Read the processor time stamp counter (TSC) using the 'rdtsc' instruction.
 *
 * @return the 64-bit counter value on x86-64; 0 on any other architecture
 * @author cjiang
 */
inline uint64_t tsc() {
#ifdef __x86_64__
  // rdtsc delivers the low half in EAX and the high half in EDX.
  uint32_t low, high;
  asm volatile("rdtsc" : "=a" (low), "=d" (high));
  return (static_cast<uint64_t>(high) << 32) | static_cast<uint64_t>(low);
#else
  // TODO(2200461): rdtsc isn't portable. Ideally we'd use some higher-level
  // portable API (clock_gettime maybe?), but that may break assumptions that
  // clients of this API make about how the underlying clock works.
  return 0;
#endif
}
/**
* This is a microbenchmark to get cpu frequency the process is running on. The
* returned value is used to convert TSC counter values to microseconds.
@@ -173,7 +154,7 @@ static int64_t get_cpu_frequency() {
perror("gettimeofday");
return 0.0;
}
uint64_t tsc_start = tsc();
uint64_t tsc_start = cpuCycles();
// Sleep for 5 milliseconds. Compared with gettimeofday's few microseconds of
// execution time, this should be enough.
usleep(5000);
@@ -181,7 +162,7 @@ static int64_t get_cpu_frequency() {
perror("gettimeofday");
return 0.0;
}
uint64_t tsc_end = tsc();
uint64_t tsc_end = cpuCycles();
return nearbyint((tsc_end - tsc_start) * 1.0
/ (get_us_interval(&start, &end)));
}
@@ -747,14 +728,14 @@ public:
}
virtual void beginFrameEx() {
m_stack->m_tsc_start = tsc();
m_stack->m_tsc_start = cpuCycles();
m_stack->m_vtsc_start = vtsc(m_MHz);
}
virtual void endFrameEx() {
CountMap &counts = m_stats[m_stack->m_name];
counts.count++;
counts.tsc += tsc() - m_stack->m_tsc_start;
counts.tsc += cpuCycles() - m_stack->m_tsc_start;
counts.vtsc += vtsc(m_MHz) - m_stack->m_vtsc_start;
}
@@ -812,7 +793,7 @@ public:
}
virtual void beginFrameEx() {
m_stack->m_tsc_start = tsc();
m_stack->m_tsc_start = cpuCycles();
if (m_flags & TrackCPU) {
m_stack->m_vtsc_start = vtsc(m_MHz);
@@ -834,7 +815,7 @@ public:
m_stack->getStack(2, symbol, sizeof(symbol));
CountMap &counts = m_stats[symbol];
counts.count++;
counts.wall_time += tsc() - m_stack->m_tsc_start;
counts.wall_time += cpuCycles() - m_stack->m_tsc_start;
if (m_flags & TrackCPU) {
counts.cpu += vtsc(m_MHz) - m_stack->m_vtsc_start;
@@ -1145,7 +1126,7 @@ public:
}
void collectStats(TraceData& te) {
te.wall_time = tsc();
te.wall_time = cpuCycles();
te.cpu = 0;
if (m_flags & TrackCPU) {
te.cpu = vtsc(m_MHz);
@@ -1393,7 +1374,7 @@ public:
uint64_t truncated_tsc;
// Init the last_sample in tsc
m_last_sample_tsc = tsc();
m_last_sample_tsc = cpuCycles();
// Find the microseconds that need to be truncated
gettimeofday(&m_last_sample_time, 0);
@@ -1473,7 +1454,7 @@ private:
if (m_stack) {
// While loop is to handle a single function taking a long time
// and passing several sampling intervals
while ((tsc() - m_last_sample_tsc) > m_sampling_interval_tsc) {
while ((cpuCycles() - m_last_sample_tsc) > m_sampling_interval_tsc) {
m_last_sample_tsc += m_sampling_interval_tsc;
// HAS TO BE UPDATED BEFORE calling sample_stack
incr_us_interval(&m_last_sample_time, SAMPLING_INTERVAL);
+1
Ver Arquivo
@@ -59,6 +59,7 @@ typedef __sighandler_t *sighandler_t;
#include "hphp/util/meta.h"
#include "hphp/util/util.h"
#include "hphp/util/repo_schema.h"
#include "hphp/util/cycles.h"
#include "hphp/runtime/vm/bytecode.h"
#include "hphp/runtime/vm/php_debug.h"
+55
Ver Arquivo
@@ -0,0 +1,55 @@
/*
+----------------------------------------------------------------------+
| HipHop for PHP |
+----------------------------------------------------------------------+
| Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) |
| Copyright (c) 1997-2010 The PHP Group |
+----------------------------------------------------------------------+
| This source file is subject to version 3.01 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| http://www.php.net/license/3_01.txt |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
*/
#ifndef incl_HPHP_TSC_H_
#define incl_HPHP_TSC_H_
#include "hphp/util/assertions.h"
namespace HPHP {
/*
 * Read the machine's cycle counter.
 *
 * On x86-64 this executes `rdtsc` and returns the combined 64-bit value
 * (EDX supplies the high 32 bits, EAX the low 32 bits). On every other
 * architecture it calls not_implemented(). While this is slightly
 * non-portable in theory, all the CPUs you're likely to care about support
 * a cycle counter in some way or another.
 */
inline uint64_t cpuCycles() {
#ifdef __x86_64__
  uint64_t low;
  uint64_t high;
  asm volatile("rdtsc" : "=a"(low), "=d"(high));
  return (high << 32) | low;
#else
  not_implemented();
#endif
}
/*
 * Execute the x86 `pause` instruction (the processor's spin-wait hint).
 * Compiles to nothing on architectures other than x86-64.
 */
inline void cpuRelax() {
#if defined(__x86_64__)
  __asm__ __volatile__("pause");
#endif
}
/*
 * Busy-wait until at least numCycles ticks of cpuCycles() have elapsed
 * since the call began. Intended for deliberately slowing down a code path
 * to measure how much it matters.
 *
 * The loop body runs at least once. For delays longer than 100 cycles each
 * iteration also calls cpuRelax(); shorter delays skip it and just re-read
 * the counter.
 */
inline void cycleDelay(uint32_t numCycles) {
  auto const start = cpuCycles();
  do {
    if (numCycles > 100) cpuRelax();
    // Keep spinning while the elapsed count is still short of the request.
    // The subtraction is unsigned, so a counter wraparound between the two
    // reads still yields the correct elapsed value.
  } while (cpuCycles() - start < numCycles);
}
}
#endif