ch3/nemesis: Remove unsupported LMTs
knem and vmsplice LMTs are not regularly tested. Downstream maintainers confirm they are not actively using these modules, so just remove them.
Esse commit está contido em:
@@ -782,11 +782,7 @@ is too big (> MPIU_SHMW_GHND_SZ)
|
||||
**abi_version_mismatch %D %D:ABI version mismatch, expected %D - got %D
|
||||
**recv_status:receive status failed
|
||||
**recv_status %d:receive status failed %d
|
||||
**invalid_knem_status:Invalid knem status value
|
||||
**invalid_knem_status %d:Invalid knem status value - %d
|
||||
|
||||
**vmsplice:vmsplice failed
|
||||
**vmsplice %d %s:vmsplice failed - errno %d (%s)
|
||||
**mkfifo:mkfifo failed
|
||||
**mkfifo %d %s:mkfifo failed - errno %d (%s)
|
||||
**tempnam:tempnam failed
|
||||
|
||||
@@ -26,6 +26,4 @@ mpi_core_sources += \
|
||||
src/mpid/ch3/channels/nemesis/src/mpid_nem_debug.c \
|
||||
src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c \
|
||||
src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt_shm.c \
|
||||
src/mpid/ch3/channels/nemesis/src/mpid_nem_net_array.c \
|
||||
src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt_dma.c \
|
||||
src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt_vmsplice.c
|
||||
src/mpid/ch3/channels/nemesis/src/mpid_nem_net_array.c
|
||||
|
||||
@@ -1,644 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
|
||||
/*
|
||||
* (C) 2008 by Argonne National Laboratory.
|
||||
* See COPYRIGHT in top-level directory.
|
||||
*/
|
||||
#include "mpid_nem_impl.h"
#include "mpid_nem_datatypes.h"

#include <unistd.h>
|
||||
|
||||
/*
|
||||
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===
|
||||
|
||||
cvars:
|
||||
- name : MPIR_CVAR_NEMESIS_LMT_DMA_THRESHOLD
|
||||
category : NEMESIS
|
||||
type : int
|
||||
default : 2097152
|
||||
class : none
|
||||
verbosity : MPI_T_VERBOSITY_USER_BASIC
|
||||
scope : MPI_T_SCOPE_ALL_EQ
|
||||
description : >-
|
||||
Messages larger than this size will use the "dma" (knem)
|
||||
intranode LMT implementation, if it is enabled and available.
|
||||
|
||||
=== END_MPI_T_CVAR_INFO_BLOCK ===
|
||||
*/
|
||||
|
||||
MPL_SUPPRESS_OSX_HAS_NO_SYMBOLS_WARNING;
|
||||
|
||||
#if defined(HAVE_KNEM_IO_H)
|
||||
|
||||
#include "knem_io.h"
|
||||
|
||||
static int knem_fd = -1;
|
||||
static int knem_has_dma = 0;
|
||||
|
||||
/* 4096 status index */
|
||||
static volatile knem_status_t *knem_status = MAP_FAILED;
|
||||
#define KNEM_STATUS_NR 4096 /* FIXME: randomly chosen */
|
||||
|
||||
/* Values of KNEM_ABI_VERSION less than this are the old interface (pre-0.7),
|
||||
* values greater than or equal to this are the newer interface. At some point
|
||||
* in the future we should drop support for the old version to keep the code
|
||||
* simpler. */
|
||||
#define MPICH_NEW_KNEM_ABI_VERSION (0x0000000c)
|
||||
|
||||
/* These are for maintaining a linked-list of outstanding requests on which we
|
||||
can make progress. */
|
||||
struct lmt_dma_node {
|
||||
struct lmt_dma_node *next;
|
||||
MPIDI_VC_t *vc; /* seems like this should be in the request somewhere, but it's not */
|
||||
MPIR_Request *req; /* do we need to store type too? */
|
||||
volatile knem_status_t *status_p;
|
||||
};
|
||||
|
||||
/* MT: this stack is not thread-safe */
|
||||
static struct lmt_dma_node *outstanding_head = NULL;
|
||||
|
||||
/* MT: this stack is not thread-safe */
|
||||
static int free_idx; /* is always the index of the next free index */
|
||||
static int index_stack[KNEM_STATUS_NR];
|
||||
|
||||
/* returns an index into knem_status that is available for use */
|
||||
static int alloc_status_index(void)
|
||||
{
|
||||
return index_stack[free_idx++];
|
||||
}
|
||||
|
||||
/* returns the given index to the pool */
|
||||
static void free_status_index(int index)
|
||||
{
|
||||
index_stack[--free_idx] = index;
|
||||
}
|
||||
|
||||
|
||||
/* Opens the knem device and sets knem_fd accordingly. Uses mpich errhandling
|
||||
conventions. */
|
||||
static int open_knem_dev(void)
|
||||
{
|
||||
int mpi_errno = MPI_SUCCESS;
|
||||
int err;
|
||||
int i;
|
||||
struct knem_cmd_info info;
|
||||
|
||||
knem_fd = open(KNEM_DEVICE_FILENAME, O_RDWR);
|
||||
MPIR_ERR_CHKANDJUMP2(knem_fd < 0, mpi_errno, MPI_ERR_OTHER, "**shm_open",
|
||||
"**shm_open %s %d", KNEM_DEVICE_FILENAME, errno);
|
||||
err = ioctl(knem_fd, KNEM_CMD_GET_INFO, &info);
|
||||
MPIR_ERR_CHKANDJUMP2(err < 0, mpi_errno, MPI_ERR_OTHER, "**ioctl",
|
||||
"**ioctl %d %s", errno, MPIR_Strerror(errno));
|
||||
MPIR_ERR_CHKANDJUMP2(info.abi != KNEM_ABI_VERSION, mpi_errno, MPI_ERR_OTHER,
|
||||
"**abi_version_mismatch", "**abi_version_mismatch %D %D",
|
||||
(unsigned long)KNEM_ABI_VERSION, (unsigned long)info.abi);
|
||||
|
||||
knem_has_dma = (info.features & KNEM_FEATURE_DMA);
|
||||
|
||||
knem_status = MPL_mmap(NULL, KNEM_STATUS_NR, PROT_READ|PROT_WRITE, MAP_SHARED, knem_fd, KNEM_STATUS_ARRAY_FILE_OFFSET, MPL_MEM_SHM);
|
||||
MPIR_ERR_CHKANDJUMP1(knem_status == MAP_FAILED, mpi_errno, MPI_ERR_OTHER, "**mmap",
|
||||
"**mmap %d", errno);
|
||||
for (i = 0; i < KNEM_STATUS_NR; ++i) {
|
||||
index_stack[i] = i;
|
||||
}
|
||||
fn_fail:
|
||||
return mpi_errno;
|
||||
}
|
||||
|
||||
/* Sends as much data from the request as possible via the knem ioctl.
|
||||
s_cookiep is an output parameter */
|
||||
static int do_dma_send(MPIDI_VC_t *vc, MPIR_Request *sreq, int send_iov_n,
|
||||
MPL_IOV send_iov[], knem_cookie_t *s_cookiep)
|
||||
{
|
||||
int mpi_errno = MPI_SUCCESS;
|
||||
int i, err;
|
||||
#if KNEM_ABI_VERSION < MPICH_NEW_KNEM_ABI_VERSION
|
||||
struct knem_cmd_init_send_param sendcmd;
|
||||
#else
|
||||
struct knem_cmd_create_region cr;
|
||||
#endif
|
||||
struct knem_cmd_param_iovec knem_iov[MPL_IOV_LIMIT];
|
||||
|
||||
/* FIXME The knem module iovec is potentially different from the system
|
||||
iovec. This causes all sorts of fun if you don't realize it and use the
|
||||
system iovec directly instead. Eventually we need to either unify them
|
||||
or avoid this extra copy. */
|
||||
for (i = 0; i < send_iov_n; ++i) {
|
||||
knem_iov[i].base = (uintptr_t)send_iov[i] .MPL_IOV_BUF;
|
||||
knem_iov[i].len = send_iov[i] .MPL_IOV_LEN;
|
||||
}
|
||||
|
||||
#if KNEM_ABI_VERSION < MPICH_NEW_KNEM_ABI_VERSION
|
||||
sendcmd.send_iovec_array = (uintptr_t) &knem_iov[0];
|
||||
sendcmd.send_iovec_nr = send_iov_n;
|
||||
sendcmd.flags = 0;
|
||||
err = ioctl(knem_fd, KNEM_CMD_INIT_SEND, &sendcmd);
|
||||
#else
|
||||
cr.iovec_array = (uintptr_t) &knem_iov[0];
|
||||
cr.iovec_nr = send_iov_n;
|
||||
cr.flags = KNEM_FLAG_SINGLEUSE;
|
||||
cr.protection = PROT_READ;
|
||||
err = ioctl(knem_fd, KNEM_CMD_CREATE_REGION, &cr);
|
||||
#endif
|
||||
MPIR_ERR_CHKANDJUMP2(err < 0, mpi_errno, MPI_ERR_OTHER, "**ioctl",
|
||||
"**ioctl %d %s", errno, MPIR_Strerror(errno));
|
||||
#if KNEM_ABI_VERSION < MPICH_NEW_KNEM_ABI_VERSION
|
||||
*s_cookiep = sendcmd.send_cookie;
|
||||
#else
|
||||
*s_cookiep = cr.cookie;
|
||||
#endif
|
||||
|
||||
fn_fail:
|
||||
fn_exit:
|
||||
return mpi_errno;
|
||||
}
|
||||
|
||||
/* s_cookie is an input parameter */
|
||||
static int do_dma_recv(int iov_n, MPL_IOV iov[], knem_cookie_t s_cookie, int nodma, volatile knem_status_t **status_p_p, knem_status_t *current_status_p)
|
||||
{
|
||||
int mpi_errno = MPI_SUCCESS;
|
||||
int i, err;
|
||||
|
||||
#if KNEM_ABI_VERSION < MPICH_NEW_KNEM_ABI_VERSION
|
||||
struct knem_cmd_init_async_recv_param recvcmd = {0};
|
||||
#else
|
||||
struct knem_cmd_inline_copy icopy;
|
||||
#endif
|
||||
struct knem_cmd_param_iovec knem_iov[MPL_IOV_LIMIT];
|
||||
|
||||
/* FIXME The knem module iovec is potentially different from the system
|
||||
iovec. This causes all sorts of fun if you don't realize it and use the
|
||||
system iovec directly instead. Eventually we need to either unify them
|
||||
or avoid this extra copy. */
|
||||
for (i = 0; i < iov_n; ++i) {
|
||||
knem_iov[i].base = (uintptr_t)iov[i] .MPL_IOV_BUF;
|
||||
knem_iov[i].len = iov[i] .MPL_IOV_LEN;
|
||||
}
|
||||
|
||||
#if KNEM_ABI_VERSION < MPICH_NEW_KNEM_ABI_VERSION
|
||||
recvcmd.recv_iovec_array = (uintptr_t) &knem_iov[0];
|
||||
recvcmd.recv_iovec_nr = iov_n;
|
||||
recvcmd.status_index = alloc_status_index();
|
||||
recvcmd.send_cookie = s_cookie;
|
||||
recvcmd.flags = nodma ? 0 : KNEM_FLAG_DMA | KNEM_FLAG_ASYNCDMACOMPLETE;
|
||||
err = ioctl(knem_fd, KNEM_CMD_INIT_ASYNC_RECV, &recvcmd);
|
||||
#else
|
||||
icopy.local_iovec_array = (uintptr_t) &knem_iov[0];
|
||||
icopy.local_iovec_nr = iov_n;
|
||||
icopy.remote_cookie = s_cookie;
|
||||
icopy.remote_offset = 0;
|
||||
icopy.write = 0;
|
||||
icopy.async_status_index = alloc_status_index();
|
||||
icopy.flags = nodma ? 0 : KNEM_FLAG_DMA | KNEM_FLAG_ASYNCDMACOMPLETE;
|
||||
err = ioctl(knem_fd, KNEM_CMD_INLINE_COPY, &icopy);
|
||||
#endif
|
||||
MPIR_ERR_CHKANDJUMP2(err < 0, mpi_errno, MPI_ERR_OTHER, "**ioctl",
|
||||
"**ioctl %d %s", errno, MPIR_Strerror(errno));
|
||||
|
||||
#if KNEM_ABI_VERSION < MPICH_NEW_KNEM_ABI_VERSION
|
||||
*status_p_p = &knem_status[recvcmd.status_index];
|
||||
*current_status_p = KNEM_STATUS_PENDING;
|
||||
#else
|
||||
*status_p_p = &knem_status[icopy.async_status_index];
|
||||
*current_status_p = icopy.current_status;
|
||||
#endif
|
||||
|
||||
fn_exit:
|
||||
return mpi_errno;
|
||||
fn_fail:
|
||||
goto fn_exit;
|
||||
}
|
||||
|
||||
/* Much like initiate_lmt except it won't send an RTS message. Used to
|
||||
implement initiate_lmt and handle_cookie. This will send as much data from
|
||||
the request in a single shot as possible.
|
||||
|
||||
s_cookiep is an output parameter. */
|
||||
static int send_sreq_data(MPIDI_VC_t *vc, MPIR_Request *sreq, knem_cookie_t *s_cookiep)
|
||||
{
|
||||
int mpi_errno = MPI_SUCCESS;
|
||||
int dt_contig;
|
||||
MPI_Aint dt_true_lb;
|
||||
intptr_t data_sz;
|
||||
MPIR_Datatype* dt_ptr;
|
||||
|
||||
/* MT: this code assumes only one thread can be at this point at a time */
|
||||
if (knem_fd < 0) {
|
||||
mpi_errno = open_knem_dev();
|
||||
if (mpi_errno) MPIR_ERR_POP(mpi_errno);
|
||||
}
|
||||
|
||||
/* find out contig/noncontig, size, and lb for the datatype */
|
||||
MPIDI_Datatype_get_info(sreq->dev.user_count, sreq->dev.datatype,
|
||||
dt_contig, data_sz, dt_ptr, dt_true_lb);
|
||||
|
||||
if (dt_contig) {
|
||||
/* handle the iov creation ourselves */
|
||||
sreq->dev.iov[0].MPL_IOV_BUF = (char *)sreq->dev.user_buf + dt_true_lb;
|
||||
sreq->dev.iov[0].MPL_IOV_LEN = data_sz;
|
||||
sreq->dev.iov_count = 1;
|
||||
}
|
||||
else {
|
||||
/* use the segment routines to handle the iovec creation */
|
||||
if (sreq->dev.msg_offset == 0) {
|
||||
sreq->dev.iov_count = MPL_IOV_LIMIT;
|
||||
sreq->dev.iov_offset = 0;
|
||||
|
||||
sreq->dev.msgsize = data_sz;
|
||||
|
||||
/* FIXME we should write our own function that isn't dependent on
|
||||
the in-request iov array. This will let us use IOVs that are
|
||||
larger than MPL_IOV_LIMIT. */
|
||||
mpi_errno = MPIDI_CH3U_Request_load_send_iov(sreq, &sreq->dev.iov[0],
|
||||
&sreq->dev.iov_count);
|
||||
if (mpi_errno) MPIR_ERR_POP(mpi_errno);
|
||||
}
|
||||
}
|
||||
|
||||
mpi_errno = do_dma_send(vc, sreq, sreq->dev.iov_count, sreq->dev.iov, s_cookiep);
|
||||
if (mpi_errno) MPIR_ERR_POP(mpi_errno);
|
||||
|
||||
fn_exit:
|
||||
return mpi_errno;
|
||||
fn_fail:
|
||||
goto fn_exit;
|
||||
}
|
||||
|
||||
/* Runs the request's OnDataAvail handler if it has one, letting the handler
   decide through *complete whether the request is finished; a request without
   a handler is completed right here and *complete is set to 1.

   Returns MPI_SUCCESS or the error reported by the handler or by
   MPID_Request_complete(). */
static inline int check_req_complete(MPIDI_VC_t *vc, MPIR_Request *req, int *complete)
{
    int mpi_errno = MPI_SUCCESS;
    int (*on_data_avail)(MPIDI_VC_t *, MPIR_Request *, int *) = req->dev.OnDataAvail;

    if (on_data_avail == NULL) {
        *complete = 1;
        mpi_errno = MPID_Request_complete(req);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
    }
    else {
        *complete = 0;
        mpi_errno = on_data_avail(vc, req, complete);
        if (mpi_errno) {
            MPIR_ERR_POP(mpi_errno);
        }
    }

  fn_fail:
    return mpi_errno;
}
|
||||
|
||||
|
||||
/* Starts a knem LMT send: allocates the cookie stored in sreq->ch.s_cookie,
   registers the send data with knem through send_sreq_data() and sends an RTS
   packet that carries the cookie.

   On failure the cookie allocation is reaped and sreq->ch.s_cookie is reset
   to NULL, so that the request does not keep a pointer to freed memory (the
   DONE handler frees sreq->ch.s_cookie again).

   Returns MPI_SUCCESS or an MPI error code. */
int MPID_nem_lmt_dma_initiate_lmt(MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt, MPIR_Request *sreq)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_nem_pkt_lmt_rts_t * const rts_pkt = (MPID_nem_pkt_lmt_rts_t *)pkt;
    MPIR_CHKPMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_LMT_DMA_INITIATE_LMT);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_LMT_DMA_INITIATE_LMT);

    MPIR_CHKPMEM_MALLOC(sreq->ch.s_cookie, knem_cookie_t *, sizeof(knem_cookie_t), mpi_errno, "s_cookie", MPL_MEM_BUFFER);

    mpi_errno = send_sreq_data(vc, sreq, sreq->ch.s_cookie);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    MPID_nem_lmt_send_RTS(vc, rts_pkt, sreq->ch.s_cookie, sizeof(knem_cookie_t));

  fn_exit:
    MPIR_CHKPMEM_COMMIT();
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_LMT_DMA_INITIATE_LMT);
    return mpi_errno;
  fn_fail:
    MPIR_CHKPMEM_REAP();
    /* the cookie buffer (if it was allocated at all) is gone now */
    sreq->ch.s_cookie = NULL;
    goto fn_exit;
}
|
||||
|
||||
/* This function is called initially when an RTS message comes in, but may also
|
||||
be called by the COOKIE handler in the non-contiguous case to process
|
||||
additional IOVs. */
|
||||
int MPID_nem_lmt_dma_start_recv(MPIDI_VC_t *vc, MPIR_Request *rreq, MPL_IOV s_cookie)
|
||||
{
|
||||
int mpi_errno = MPI_SUCCESS;
|
||||
int nodma;
|
||||
int dt_contig;
|
||||
MPI_Aint dt_true_lb;
|
||||
intptr_t data_sz;
|
||||
MPIR_Datatype* dt_ptr;
|
||||
volatile knem_status_t *status;
|
||||
knem_status_t current_status;
|
||||
struct lmt_dma_node *node = NULL;
|
||||
MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_LMT_DMA_START_RECV);
|
||||
|
||||
MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_LMT_DMA_START_RECV);
|
||||
|
||||
/* MT: this code assumes only one thread can be at this point at a time */
|
||||
if (knem_fd < 0) {
|
||||
mpi_errno = open_knem_dev();
|
||||
if (mpi_errno) MPIR_ERR_POP(mpi_errno);
|
||||
}
|
||||
|
||||
/* find out contig/noncontig, size, and lb for the datatype */
|
||||
MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype,
|
||||
dt_contig, data_sz, dt_ptr, dt_true_lb);
|
||||
|
||||
nodma = !knem_has_dma || data_sz < MPIR_CVAR_NEMESIS_LMT_DMA_THRESHOLD;
|
||||
|
||||
if (dt_contig) {
|
||||
/* handle the iov creation ourselves */
|
||||
rreq->dev.iov[0].MPL_IOV_BUF = (char *)rreq->dev.user_buf + dt_true_lb;
|
||||
rreq->dev.iov[0].MPL_IOV_LEN = data_sz;
|
||||
rreq->dev.iov_count = 1;
|
||||
}
|
||||
else {
|
||||
if (rreq->dev.msg_offset == 0) {
|
||||
rreq->dev.msgsize = data_sz;
|
||||
|
||||
/* see load_send_iov FIXME above */
|
||||
mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
|
||||
if (mpi_errno) MPIR_ERR_POP(mpi_errno);
|
||||
}
|
||||
}
|
||||
|
||||
MPIR_Assert(s_cookie.MPL_IOV_LEN == sizeof(knem_cookie_t));
|
||||
MPIR_Assert(s_cookie.MPL_IOV_BUF != NULL);
|
||||
mpi_errno = do_dma_recv(rreq->dev.iov_count, rreq->dev.iov,
|
||||
*((knem_cookie_t *)s_cookie.MPL_IOV_BUF), nodma,
|
||||
&status, ¤t_status);
|
||||
if (mpi_errno) MPIR_ERR_POP(mpi_errno);
|
||||
|
||||
/* TODO refactor this block and MPID_nem_lmt_dma_progress (and anywhere
|
||||
* else) to share a common function. This advancement/completion code is
|
||||
* duplication. */
|
||||
if (current_status != KNEM_STATUS_PENDING) {
|
||||
/* complete the request if all data has been sent, remove it from the list */
|
||||
int complete = 0;
|
||||
|
||||
MPIR_ERR_CHKANDJUMP1(current_status == KNEM_STATUS_FAILED, mpi_errno, MPI_ERR_OTHER,
|
||||
"**recv_status", "**recv_status %d", current_status);
|
||||
|
||||
mpi_errno = check_req_complete(vc, rreq, &complete);
|
||||
if (mpi_errno) MPIR_ERR_POP(mpi_errno);
|
||||
|
||||
free_status_index(status - knem_status);
|
||||
|
||||
if (complete) {
|
||||
/* request was completed by the OnDataAvail fn */
|
||||
MPID_nem_lmt_send_DONE(vc, rreq); /* tell the other side to complete its request */
|
||||
MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete");
|
||||
|
||||
}
|
||||
else {
|
||||
/* There is more data to send. We must inform the sender that we have
|
||||
completely received the current batch and that the next batch should
|
||||
be sent. */
|
||||
MPID_nem_lmt_send_COOKIE(vc, rreq, NULL, 0);
|
||||
}
|
||||
}
|
||||
|
||||
/* XXX DJG FIXME this looks like it always pushes! */
|
||||
/* push request if not complete for progress checks later */
|
||||
node = MPL_malloc(sizeof(struct lmt_dma_node), MPL_MEM_OTHER);
|
||||
node->vc = vc;
|
||||
node->req = rreq;
|
||||
node->status_p = status;
|
||||
node->next = outstanding_head;
|
||||
outstanding_head = node;
|
||||
++MPID_nem_local_lmt_pending;
|
||||
|
||||
fn_exit:
|
||||
MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_LMT_DMA_START_RECV);
|
||||
return mpi_errno;
|
||||
fn_fail:
|
||||
goto fn_exit;
|
||||
}
|
||||
|
||||
/* Called on the sender when the receiver reports that the transfer is done.
   Frees the cookie that was allocated for the RTS packet and completes the
   send request, either directly or through its OnDataAvail handler.

   Returns MPI_SUCCESS or the error reported while completing the request, so
   that such errors reach the caller instead of being dropped. */
int MPID_nem_lmt_dma_done_send(MPIDI_VC_t *vc, MPIR_Request *sreq)
{
    int mpi_errno = MPI_SUCCESS;
    int complete = 0;
    int (*reqFn)(MPIDI_VC_t *, MPIR_Request *, int *);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_LMT_DMA_DONE_SEND);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_LMT_DMA_DONE_SEND);

    /* free cookie from RTS packet; clear the pointer so it cannot be freed
       or used again through the request */
    MPL_free(sreq->ch.s_cookie);
    sreq->ch.s_cookie = NULL;

    /* We shouldn't ever need to handle the more IOVs case here.  The DONE
       message should only be sent when all of the data is truly transferred.
       However in the interest of robustness, we'll start to handle it and
       assert if it looks like we were supposed to send more data for some
       reason. */
    reqFn = sreq->dev.OnDataAvail;
    if (!reqFn) {
        mpi_errno = MPID_Request_complete(sreq);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
        MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete");
        goto fn_exit;
    }

    complete = 0;
    mpi_errno = reqFn(vc, sreq, &complete);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    if (complete) {
        /* request was completed by the OnDataAvail fn */
        MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete");
    }
    else {
        /* There is more data to send. */
        MPIR_Assert(("should never be incomplete!", 0));
    }

  fn_exit:
    /* the exit trace is emitted on every path */
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_LMT_DMA_DONE_SEND);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
|
||||
|
||||
/* called when a COOKIE message is received */
|
||||
int MPID_nem_lmt_dma_handle_cookie(MPIDI_VC_t *vc, MPIR_Request *req, MPL_IOV cookie)
|
||||
{
|
||||
int mpi_errno = MPI_SUCCESS;
|
||||
MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_LMT_DMA_HANDLE_COOKIE);
|
||||
|
||||
MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_LMT_DMA_HANDLE_COOKIE);
|
||||
|
||||
if (cookie.MPL_IOV_LEN == 0 && cookie.MPL_IOV_BUF == NULL) {
|
||||
/* req is a send request, we need to initiate another knem request and
|
||||
send a COOKIE message back to the receiver indicating the lid
|
||||
returned from the ioctl. */
|
||||
int complete;
|
||||
knem_cookie_t s_cookie;
|
||||
|
||||
/* This function will invoke the OnDataAvail function to load more data. */
|
||||
mpi_errno = check_req_complete(vc, req, &complete);
|
||||
if (mpi_errno) MPIR_ERR_POP(mpi_errno);
|
||||
|
||||
/* If we were complete we should have received a DONE message instead
|
||||
of a COOKIE message. */
|
||||
MPIR_Assert(!complete);
|
||||
|
||||
mpi_errno = do_dma_send(vc, req, req->dev.iov_count, &req->dev.iov[0], &s_cookie);
|
||||
if (mpi_errno) MPIR_ERR_POP(mpi_errno);
|
||||
MPID_nem_lmt_send_COOKIE(vc, req, &s_cookie, sizeof(knem_cookie_t));
|
||||
}
|
||||
else {
|
||||
/* req is a receive request and we need to continue receiving using the
|
||||
lid provided in the cookie iov. */
|
||||
mpi_errno = MPID_nem_lmt_dma_start_recv(vc, req, cookie);
|
||||
if (mpi_errno) MPIR_ERR_POP(mpi_errno);
|
||||
}
|
||||
|
||||
fn_fail:
|
||||
MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_LMT_DMA_HANDLE_COOKIE);
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
|
||||
int MPID_nem_lmt_dma_progress(void)
|
||||
{
|
||||
int mpi_errno = MPI_SUCCESS;
|
||||
struct lmt_dma_node *prev = NULL;
|
||||
struct lmt_dma_node *free_me = NULL;
|
||||
struct lmt_dma_node *cur = outstanding_head;
|
||||
MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_LMT_DMA_PROGRESS);
|
||||
|
||||
MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_LMT_DMA_PROGRESS);
|
||||
|
||||
/* Iterate over a linked-list of (req,status_idx)-tuples looking for
|
||||
completed/failed requests. Currently knem only provides status to the
|
||||
receiver, so all of these requests are recv requests. */
|
||||
while (cur) {
|
||||
switch (*cur->status_p) {
|
||||
case KNEM_STATUS_SUCCESS:
|
||||
{
|
||||
/* complete the request if all data has been sent, remove it from the list */
|
||||
int complete = 0;
|
||||
mpi_errno = check_req_complete(cur->vc, cur->req, &complete);
|
||||
if (mpi_errno) MPIR_ERR_POP(mpi_errno);
|
||||
|
||||
free_status_index(cur->status_p - knem_status);
|
||||
|
||||
if (complete) {
|
||||
/* request was completed by the OnDataAvail fn */
|
||||
MPID_nem_lmt_send_DONE(cur->vc, cur->req); /* tell the other side to complete its request */
|
||||
MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete");
|
||||
|
||||
}
|
||||
else {
|
||||
/* There is more data to send. We must inform the sender that we have
|
||||
completely received the current batch and that the next batch should
|
||||
be sent. */
|
||||
MPID_nem_lmt_send_COOKIE(cur->vc, cur->req, NULL, 0);
|
||||
}
|
||||
|
||||
/* Right now we always free the cur element, even if the
|
||||
request is incomplete because it simplifies the logic. */
|
||||
if (cur == outstanding_head) {
|
||||
outstanding_head = cur->next;
|
||||
prev = NULL;
|
||||
free_me = cur;
|
||||
cur = cur->next;
|
||||
}
|
||||
else {
|
||||
prev->next = cur->next;
|
||||
free_me = cur;
|
||||
cur = cur->next;
|
||||
}
|
||||
MPL_free(free_me);
|
||||
--MPID_nem_local_lmt_pending;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case KNEM_STATUS_FAILED:
|
||||
/* set the error status for the request, complete it then dequeue the entry */
|
||||
cur->req->status.MPI_ERROR = MPI_SUCCESS;
|
||||
MPIR_ERR_SET1(cur->req->status.MPI_ERROR, MPI_ERR_OTHER, "**recv_status", "**recv_status %d", *cur->status_p);
|
||||
|
||||
mpi_errno = MPID_Request_complete(cur->req);
|
||||
if (mpi_errno != MPI_SUCCESS) {
|
||||
MPIR_ERR_POP(mpi_errno);
|
||||
}
|
||||
|
||||
if (cur == outstanding_head) {
|
||||
outstanding_head = cur->next;
|
||||
prev = NULL;
|
||||
free_me = cur;
|
||||
cur = cur->next;
|
||||
}
|
||||
else {
|
||||
prev->next = cur->next;
|
||||
free_me = cur;
|
||||
cur = cur->next;
|
||||
}
|
||||
|
||||
MPL_free(free_me);
|
||||
--MPID_nem_local_lmt_pending;
|
||||
continue;
|
||||
|
||||
break;
|
||||
case KNEM_STATUS_PENDING:
|
||||
/* nothing to do here */
|
||||
break;
|
||||
default:
|
||||
MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**invalid_knem_status",
|
||||
"**invalid_knem_status %d", *cur->status_p);
|
||||
break;
|
||||
}
|
||||
|
||||
prev = cur;
|
||||
cur = cur->next;
|
||||
}
|
||||
|
||||
fn_exit:
|
||||
MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_LMT_DMA_PROGRESS);
|
||||
return mpi_errno;
|
||||
fn_fail:
|
||||
goto fn_exit;
|
||||
}
|
||||
|
||||
/* Hook invoked for a terminated VC.  Only emits the enter/exit traces and
   always returns MPI_SUCCESS. */
int MPID_nem_lmt_dma_vc_terminated(MPIDI_VC_t *vc)
{
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_LMT_DMA_VC_TERMINATED);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_LMT_DMA_VC_TERMINATED);

    /* Do nothing.  KNEM should abort any ops with dead processes. */

    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_LMT_DMA_VC_TERMINATED);
    return MPI_SUCCESS;
}
|
||||
|
||||
/* --------------------------------------------------------------------------
|
||||
The functions below are nops, stubs that might be used in later versions of
|
||||
this code.
|
||||
-------------------------------------------------------------------------- */
|
||||
|
||||
/* called when a CTS message is received */
|
||||
int MPID_nem_lmt_dma_start_send(MPIDI_VC_t *vc, MPIR_Request *req, MPL_IOV r_cookie)
|
||||
{
|
||||
int mpi_errno = MPI_SUCCESS;
|
||||
MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_LMT_DMA_START_SEND);
|
||||
|
||||
MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_LMT_DMA_START_SEND);
|
||||
|
||||
MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_LMT_DMA_START_SEND);
|
||||
return mpi_errno;
|
||||
}
|
||||
|
||||
/* called when a DONE message is received for a receive request */
|
||||
int MPID_nem_lmt_dma_done_recv(MPIDI_VC_t *vc, MPIR_Request *rreq)
|
||||
{
|
||||
MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_LMT_DMA_DONE_RECV);
|
||||
|
||||
MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_LMT_DMA_DONE_RECV);
|
||||
|
||||
MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_LMT_DMA_DONE_RECV);
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
|
||||
#endif /* HAVE_KNEM_IO_H */
|
||||
@@ -1,467 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
|
||||
/*
|
||||
* (C) 2009 by Argonne National Laboratory.
|
||||
* See COPYRIGHT in top-level directory.
|
||||
*/
|
||||
#include "mpid_nem_impl.h"
|
||||
#include "mpid_nem_datatypes.h"
|
||||
|
||||
MPL_SUPPRESS_OSX_HAS_NO_SYMBOLS_WARNING;
|
||||
|
||||
#if defined(HAVE_VMSPLICE)
|
||||
|
||||
/* must come first for now */
|
||||
#define _GNU_SOURCE
|
||||
#include <fcntl.h>
|
||||
#include <sys/uio.h>
|
||||
|
||||
#include "mpid_nem_impl.h"
|
||||
#include "mpid_nem_datatypes.h"
|
||||
|
||||
|
||||
/* These are for maintaining a linked-list of outstanding requests on which we
|
||||
can make progress. */
|
||||
struct lmt_vmsplice_node {
|
||||
struct lmt_vmsplice_node *next;
|
||||
int pipe_fd;
|
||||
MPIR_Request *req;
|
||||
};
|
||||
|
||||
/* MT: this stack is not thread-safe */
|
||||
static struct lmt_vmsplice_node *outstanding_head = NULL;
|
||||
|
||||
/* Returns true if the IOV has been completely xferred, false otherwise.
|
||||
|
||||
iov_count and iov_offset are pointers so that this function can manipulate
|
||||
them */
|
||||
static int adjust_partially_xferred_iov(MPL_IOV iov[], int *iov_offset,
|
||||
int *iov_count, int bytes_xferred)
|
||||
{
|
||||
int i;
|
||||
int complete = 1;
|
||||
|
||||
for (i = *iov_offset; i < (*iov_offset + *iov_count); ++i)
|
||||
{
|
||||
if (bytes_xferred < iov[i].MPL_IOV_LEN)
|
||||
{
|
||||
iov[i].MPL_IOV_BUF = (char *)iov[i].MPL_IOV_BUF + bytes_xferred;
|
||||
iov[i].MPL_IOV_LEN -= bytes_xferred;
|
||||
/* iov_count should be equal to the number of iov's remaining */
|
||||
*iov_count -= (i - *iov_offset);
|
||||
*iov_offset = i;
|
||||
complete = 0;
|
||||
break;
|
||||
}
|
||||
bytes_xferred -= iov[i].MPL_IOV_LEN;
|
||||
}
|
||||
|
||||
return complete;
|
||||
}
|
||||
|
||||
/* Runs the request's OnDataAvail handler if it has one, after resetting the
   request's iov_count to MPL_IOV_LIMIT and iov_offset to 0; the handler
   decides through *complete whether the request is finished.  A request
   without a handler is completed right here and *complete is set to 1.

   Returns MPI_SUCCESS or the error reported by the handler or by
   MPID_Request_complete(). */
static inline int check_req_complete(MPIDI_VC_t *vc, MPIR_Request *req, int *complete)
{
    int mpi_errno = MPI_SUCCESS;
    int (*on_data_avail)(MPIDI_VC_t *, MPIR_Request *, int *) = req->dev.OnDataAvail;

    if (on_data_avail == NULL) {
        *complete = 1;
        mpi_errno = MPID_Request_complete(req);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
    }
    else {
        *complete = 0;

        /* XXX DJG FIXME this feels like a hack */
        req->dev.iov_count = MPL_IOV_LIMIT;
        req->dev.iov_offset = 0;

        mpi_errno = on_data_avail(vc, req, complete);
        if (mpi_errno) {
            MPIR_ERR_POP(mpi_errno);
        }
    }

  fn_fail:
    return mpi_errno;
}
|
||||
|
||||
/* fills in req->dev.iov{,_offset,_count} based on the datatype info in the
|
||||
request, creating a segment if necessary */
|
||||
static int populate_iov_from_req(MPIR_Request *req)
|
||||
{
|
||||
int mpi_errno = MPI_SUCCESS;
|
||||
int dt_contig;
|
||||
MPI_Aint dt_true_lb;
|
||||
intptr_t data_sz;
|
||||
MPIR_Datatype* dt_ptr;
|
||||
|
||||
/* find out contig/noncontig, size, and lb for the datatype */
|
||||
MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype,
|
||||
dt_contig, data_sz, dt_ptr, dt_true_lb);
|
||||
|
||||
if (dt_contig) {
|
||||
/* handle the iov creation ourselves */
|
||||
req->dev.iov[0].MPL_IOV_BUF = (char *)req->dev.user_buf + dt_true_lb;
|
||||
req->dev.iov[0].MPL_IOV_LEN = data_sz;
|
||||
req->dev.iov_count = 1;
|
||||
}
|
||||
else {
|
||||
/* use the segment routines to handle the iovec creation */
|
||||
req->dev.iov_count = MPL_IOV_LIMIT;
|
||||
req->dev.iov_offset = 0;
|
||||
|
||||
req->dev.msg_offset = 0;
|
||||
req->dev.msgsize = data_sz;
|
||||
|
||||
/* FIXME we should write our own function that isn't dependent on
|
||||
the in-request iov array. This will let us use IOVs that are
|
||||
larger than MPL_IOV_LIMIT. */
|
||||
mpi_errno = MPIDI_CH3U_Request_load_send_iov(req, &req->dev.iov[0],
|
||||
&req->dev.iov_count);
|
||||
if (mpi_errno) MPIR_ERR_POP(mpi_errno);
|
||||
}
|
||||
|
||||
fn_fail:
|
||||
return mpi_errno;
|
||||
}
|
||||
|
||||
/* Pushes the *iov_count entries of iov starting at *iov_offset into pipe_fd
   with a non-blocking vmsplice().

   When the call reports EAGAIN nothing else happens: *complete is not
   written and MPI_SUCCESS is returned.  Otherwise the iov bookkeeping is
   advanced by the number of bytes moved; once the current iov is used up the
   request is asked for more data, and when the request turns out to be
   complete the pipe is closed.  *complete reports whether the request is
   finished.

   Returns MPI_SUCCESS or an MPI error code. */
static int do_vmsplice(MPIR_Request *sreq, int pipe_fd, MPL_IOV iov[],
                       int *iov_offset, int *iov_count, int *complete)
{
    int mpi_errno = MPI_SUCCESS;
    ssize_t err;

    err = vmsplice(pipe_fd, &iov[*iov_offset], *iov_count, SPLICE_F_NONBLOCK);

    /* a full pipe is not an error, just try again later */
    if (err < 0 && errno == EAGAIN)
        goto fn_exit;
    MPIR_ERR_CHKANDJUMP2(err < 0, mpi_errno, MPI_ERR_OTHER, "**vmsplice",
                         "**vmsplice %d %s", errno, MPIR_Strerror(errno));

    *complete = adjust_partially_xferred_iov(iov, iov_offset, iov_count, err);
    if (!*complete)
        goto fn_exit;

    /* look for additional data to send and reload IOV if there is more */
    mpi_errno = check_req_complete(sreq->ch.vc, sreq, complete);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    if (*complete) {
        err = close(pipe_fd);
        MPIR_ERR_CHKANDJUMP(err < 0, mpi_errno, MPI_ERR_OTHER, "**close");
        MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete");
    }

  fn_fail:
  fn_exit:
    return mpi_errno;
}
|
||||
|
||||
/* Starts a vmsplice LMT send by sending an RTS packet whose cookie is the
   name of the FIFO used for this VC.

   The FIFO is created on first use and its name is cached in
   vc->ch.lmt_copy_buf_handle, so the same pipe is reused for later sends on
   the VC.  If the name cannot be duplicated or mkfifo() fails, the cached
   name is cleared again; otherwise later calls would skip the creation and
   advertise a FIFO that does not exist.

   No data is sent here; that has to wait for the full RTS/CTS handshake.
   sreq is not used by this function.

   Returns MPI_SUCCESS or an MPI error code. */
int MPID_nem_lmt_vmsplice_initiate_lmt(MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt, MPIR_Request *sreq)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_nem_pkt_lmt_rts_t * const rts_pkt = (MPID_nem_pkt_lmt_rts_t *)pkt;
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_LMT_VMSPLICE_INITIATE_LMT);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_LMT_VMSPLICE_INITIATE_LMT);

    /* re-use the same pipe per-pair,per-sender */
    if (vc_ch->lmt_copy_buf_handle == NULL) {
        int err;
        char *pipe_name;

        /* NOTE(review): tempnam() only picks a name, so another process can
           create the path before mkfifo() runs; mkfifo() then fails rather
           than reusing the foreign file. */
        pipe_name = tempnam(NULL, "lmt_");
        MPIR_ERR_CHKANDJUMP2(!pipe_name, mpi_errno, MPI_ERR_OTHER, "**tempnam",
                             "**tempnam %d %s", errno, MPIR_Strerror(errno));

        vc_ch->lmt_copy_buf_handle = MPL_strdup(pipe_name);
        /* XXX DJG hack: tempnam() allocates with the C library allocator, so
           the plain free() is needed here */
#undef free
        free(pipe_name);
        MPIR_ERR_CHKANDJUMP(vc_ch->lmt_copy_buf_handle == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");

        err = mkfifo(vc_ch->lmt_copy_buf_handle, 0660);
        if (err < 0) {
            /* forget the name so the next call starts over; keep errno
               intact for the error message below */
            const int saved_errno = errno;
            MPL_free(vc_ch->lmt_copy_buf_handle);
            vc_ch->lmt_copy_buf_handle = NULL;
            errno = saved_errno;
        }
        MPIR_ERR_CHKANDJUMP2(err < 0, mpi_errno, MPI_ERR_OTHER, "**mkfifo",
                             "**mkfifo %d %s", errno, MPIR_Strerror(errno));
    }

    /* can't start sending data yet, need full RTS/CTS handshake */

    MPID_nem_lmt_send_RTS(vc, rts_pkt, vc_ch->lmt_copy_buf_handle,
                          strlen(vc_ch->lmt_copy_buf_handle)+1);

  fn_fail:
  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_LMT_VMSPLICE_INITIATE_LMT);
    return mpi_errno;
}
|
||||
|
||||
/* Reads from pipe_fd into the iov of rreq with readv().

   When the call reports EAGAIN nothing else happens: *complete is not
   written and MPI_SUCCESS is returned.  Any other readv() failure is
   reported as an error.  Otherwise the iov bookkeeping passed in through
   iov / iov_offset / iov_count is advanced by the number of bytes read; once
   the current iov is used up the request is asked for more data, and when
   the request turns out to be complete the pipe is closed.  *complete
   reports whether the request is finished.

   NOTE(review): readv() is given the request's own iov fields rather than the
   iov / iov_offset / iov_count parameters; presumably callers pass exactly
   those fields -- confirm against the callers.

   Returns MPI_SUCCESS or an MPI error code. */
static int do_readv(MPIR_Request *rreq, int pipe_fd, MPL_IOV iov[],
                    int *iov_offset, int *iov_count, int *complete)
{
    int mpi_errno = MPI_SUCCESS;
    ssize_t nread;

    nread = readv(pipe_fd, &rreq->dev.iov[rreq->dev.iov_offset], rreq->dev.iov_count);
    MPIR_ERR_CHKANDJUMP2(nread < 0 && errno != EAGAIN, mpi_errno, MPI_ERR_OTHER, "**read",
                         "**readv %d %s", errno, MPIR_Strerror(errno));

    /* the only failure that gets past the check above is EAGAIN: no data is
       available yet, try again later */
    if (nread < 0) goto fn_exit;

    *complete = adjust_partially_xferred_iov(iov, iov_offset, iov_count, nread);
    if (*complete) {
        /* look for additional data to receive and reload IOV if there is more */
        mpi_errno = check_req_complete(rreq->ch.vc, rreq, complete);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        if (*complete) {
            nread = close(pipe_fd);
            MPIR_ERR_CHKANDJUMP(nread < 0, mpi_errno, MPI_ERR_OTHER, "**close");
            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete");
        }
    }

  fn_fail:
  fn_exit:
    return mpi_errno;
}
|
||||
|
||||
/* This function is called when an RTS message comes in. */
|
||||
int MPID_nem_lmt_vmsplice_start_recv(MPIDI_VC_t *vc, MPIR_Request *rreq, MPL_IOV s_cookie)
|
||||
{
|
||||
int mpi_errno = MPI_SUCCESS;
|
||||
int i;
|
||||
int complete = 0;
|
||||
struct lmt_vmsplice_node *node = NULL;
|
||||
MPIDI_CH3I_VC *vc_ch = &vc->ch;
|
||||
int pipe_fd;
|
||||
MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_LMT_VMSPLICE_START_RECV);
|
||||
|
||||
MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_LMT_VMSPLICE_START_RECV);
|
||||
|
||||
if (vc_ch->lmt_recv_copy_buf_handle == NULL) {
|
||||
MPIR_Assert(s_cookie.MPL_IOV_BUF != NULL);
|
||||
vc_ch->lmt_recv_copy_buf_handle = MPL_strdup(s_cookie.MPL_IOV_BUF);
|
||||
}
|
||||
|
||||
/* XXX DJG FIXME in a real version we would want to cache the fd on the vc
|
||||
so that we don't have two open's on the critical path every time. */
|
||||
pipe_fd = open(vc_ch->lmt_recv_copy_buf_handle, O_NONBLOCK|O_RDONLY);
|
||||
MPIR_ERR_CHKANDJUMP1(pipe_fd < 0, mpi_errno, MPI_ERR_OTHER, "**open",
|
||||
"**open %s", MPIR_Strerror(errno));
|
||||
|
||||
MPID_nem_lmt_send_CTS(vc, rreq, NULL, 0);
|
||||
|
||||
mpi_errno = populate_iov_from_req(rreq);
|
||||
if (mpi_errno) MPIR_ERR_POP(mpi_errno);
|
||||
|
||||
mpi_errno = do_readv(rreq, pipe_fd, rreq->dev.iov, &rreq->dev.iov_offset,
|
||||
&rreq->dev.iov_count, &complete);
|
||||
|
||||
/* push request if not complete for progress checks later */
|
||||
if (!complete) {
|
||||
node = MPL_malloc(sizeof(struct lmt_vmsplice_node), MPL_MEM_OTHER);
|
||||
node->pipe_fd = pipe_fd;
|
||||
node->req = rreq;
|
||||
node->next = outstanding_head;
|
||||
outstanding_head = node;
|
||||
++MPID_nem_local_lmt_pending;
|
||||
}
|
||||
|
||||
fn_exit:
|
||||
MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_LMT_VMSPLICE_START_RECV);
|
||||
return mpi_errno;
|
||||
fn_fail:
|
||||
goto fn_exit;
|
||||
}
|
||||
|
||||
/* XXX DJG FIXME at some point this should poll, much like the newtcp module.
|
||||
But then we have that whole pollfd array to manage, which we don't really
|
||||
need until this proof-of-concept proves itself. */
|
||||
int MPID_nem_lmt_vmsplice_progress(void)
|
||||
{
|
||||
int mpi_errno = MPI_SUCCESS;
|
||||
struct lmt_vmsplice_node *prev = NULL;
|
||||
struct lmt_vmsplice_node *free_me = NULL;
|
||||
struct lmt_vmsplice_node *cur = outstanding_head;
|
||||
MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_LMT_VMSPLICE_PROGRESS);
|
||||
|
||||
MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_LMT_VMSPLICE_PROGRESS);
|
||||
|
||||
while (cur) {
|
||||
int complete = 0;
|
||||
|
||||
switch (MPIDI_Request_get_type(cur->req)) {
|
||||
case MPIDI_REQUEST_TYPE_RECV:
|
||||
mpi_errno = do_readv(cur->req, cur->pipe_fd, cur->req->dev.iov,
|
||||
&cur->req->dev.iov_offset,
|
||||
&cur->req->dev.iov_count, &complete);
|
||||
/* FIXME: set the error status of the req and complete it, rather than POP */
|
||||
if (mpi_errno) MPIR_ERR_POP(mpi_errno);
|
||||
break;
|
||||
case MPIDI_REQUEST_TYPE_SEND:
|
||||
mpi_errno = do_vmsplice(cur->req, cur->pipe_fd, cur->req->dev.iov,
|
||||
&cur->req->dev.iov_offset,
|
||||
&cur->req->dev.iov_count, &complete);
|
||||
/* FIXME: set the error status of the req and complete it, rather than POP */
|
||||
if (mpi_errno) MPIR_ERR_POP(mpi_errno);
|
||||
break;
|
||||
default:
|
||||
MPIR_ERR_INTERNALANDJUMP(mpi_errno, "unexpected request type");
|
||||
break;
|
||||
}
|
||||
|
||||
if (complete) {
|
||||
MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete");
|
||||
|
||||
/* remove the node from the list */
|
||||
if (cur == outstanding_head) {
|
||||
outstanding_head = cur->next;
|
||||
prev = NULL;
|
||||
free_me = cur;
|
||||
cur = cur->next;
|
||||
}
|
||||
else {
|
||||
prev->next = cur->next;
|
||||
prev = cur;
|
||||
free_me = cur;
|
||||
cur = cur->next;
|
||||
}
|
||||
MPL_free(free_me);
|
||||
--MPID_nem_local_lmt_pending;
|
||||
}
|
||||
|
||||
if (!cur) break; /* we might have made cur NULL above */
|
||||
|
||||
prev = cur;
|
||||
cur = cur->next;
|
||||
}
|
||||
|
||||
fn_exit:
|
||||
MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_LMT_VMSPLICE_PROGRESS);
|
||||
return mpi_errno;
|
||||
fn_fail:
|
||||
goto fn_exit;
|
||||
}
|
||||
|
||||
/* called when a CTS message is received */
|
||||
int MPID_nem_lmt_vmsplice_start_send(MPIDI_VC_t *vc, MPIR_Request *sreq, MPL_IOV r_cookie)
|
||||
{
|
||||
int mpi_errno = MPI_SUCCESS;
|
||||
MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_LMT_VMSPLICE_START_SEND);
|
||||
int pipe_fd;
|
||||
int complete;
|
||||
struct lmt_vmsplice_node *node = NULL;
|
||||
int (*reqFn)(MPIDI_VC_t *, MPIR_Request *, int *);
|
||||
MPIDI_CH3I_VC *vc_ch = &vc->ch;
|
||||
|
||||
MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_LMT_VMSPLICE_START_SEND);
|
||||
|
||||
/* Must do this after the other side has opened for reading, otherwise we
|
||||
will error out with ENXIO. This will be indicated by the receipt of a
|
||||
CTS message. */
|
||||
pipe_fd = open(vc_ch->lmt_copy_buf_handle, O_NONBLOCK|O_WRONLY);
|
||||
MPIR_ERR_CHKANDJUMP1(pipe_fd < 0, mpi_errno, MPI_ERR_OTHER, "**open",
|
||||
"**open %s", MPIR_Strerror(errno));
|
||||
|
||||
mpi_errno = populate_iov_from_req(sreq);
|
||||
if (mpi_errno) MPIR_ERR_POP(mpi_errno);
|
||||
|
||||
/* send the first flight */
|
||||
sreq->ch.vc = vc; /* XXX DJG is this already assigned? */
|
||||
complete = 0;
|
||||
mpi_errno = do_vmsplice(sreq, pipe_fd, sreq->dev.iov,
|
||||
&sreq->dev.iov_offset, &sreq->dev.iov_count, &complete);
|
||||
if (mpi_errno) MPIR_ERR_POP(mpi_errno);
|
||||
|
||||
if (!complete) {
|
||||
/* push for later progress */
|
||||
node = MPL_malloc(sizeof(struct lmt_vmsplice_node), MPL_MEM_OTHER);
|
||||
node->pipe_fd = pipe_fd;
|
||||
node->req = sreq;
|
||||
node->next = outstanding_head;
|
||||
outstanding_head = node;
|
||||
++MPID_nem_local_lmt_pending;
|
||||
}
|
||||
|
||||
fn_fail:
|
||||
fn_exit:
|
||||
MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_LMT_VMSPLICE_START_SEND);
|
||||
return mpi_errno;
|
||||
}
|
||||
|
||||
/*
 * VC-termination hook for the vmsplice LMT.  Currently performs no work
 * beyond the function enter/exit tracing and always returns MPI_SUCCESS.
 *
 * vc - the virtual connection being terminated (not used yet)
 */
int MPIDI_CH3_MPID_nem_lmt_vmsplice_vc_terminated(MPIDI_VC_t *vc)
{
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3_MPID_NEM_LMT_VMSPLICE_VC_TERMINATED);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3_MPID_NEM_LMT_VMSPLICE_VC_TERMINATED);

    /* FIXME: need to handle the case where a VC is terminated due to
       a process failure.  We need to remove any outstanding LMT ops
       for this VC. */

    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3_MPID_NEM_LMT_VMSPLICE_VC_TERMINATED);
    return MPI_SUCCESS;
}
|
||||
|
||||
|
||||
/* --------------------------------------------------------------------------
|
||||
The functions below are nops, stubs that might be used in later versions of
|
||||
this code.
|
||||
-------------------------------------------------------------------------- */
|
||||
|
||||
/* called when a DONE message is received for a receive request */
|
||||
int MPID_nem_lmt_vmsplice_done_recv(MPIDI_VC_t *vc, MPIR_Request *rreq)
|
||||
{
|
||||
MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_LMT_VMSPLICE_DONE_RECV);
|
||||
|
||||
MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_LMT_VMSPLICE_DONE_RECV);
|
||||
|
||||
/* nop */
|
||||
|
||||
MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_LMT_VMSPLICE_DONE_RECV);
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Send-side counterpart of MPID_nem_lmt_vmsplice_done_recv.
 *
 * Stub: does nothing besides the function enter/exit tracing.
 *
 * vc   - virtual connection (not used)
 * sreq - send request (not used)
 *
 * Always returns MPI_SUCCESS.
 */
int MPID_nem_lmt_vmsplice_done_send(MPIDI_VC_t *vc, MPIR_Request *sreq)
{
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_LMT_VMSPLICE_DONE_SEND);
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_LMT_VMSPLICE_DONE_SEND);
    /* intentionally empty */
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_LMT_VMSPLICE_DONE_SEND);

    return mpi_errno;
}
|
||||
|
||||
/* called when a COOKIE message is received */
|
||||
int MPID_nem_lmt_vmsplice_handle_cookie(MPIDI_VC_t *vc, MPIR_Request *req, MPL_IOV cookie)
|
||||
{
|
||||
int mpi_errno = MPI_SUCCESS;
|
||||
MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_LMT_VMSPLICE_HANDLE_COOKIE);
|
||||
|
||||
MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_LMT_VMSPLICE_HANDLE_COOKIE);
|
||||
|
||||
/* nop */
|
||||
|
||||
MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_LMT_VMSPLICE_HANDLE_COOKIE);
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -246,36 +246,12 @@ if test "$pac_cv_have_struct_ifreq" = "yes" ; then
|
||||
AC_DEFINE(HAVE_STRUCT_IFREQ,1,[Define if struct ifreq can be used])
|
||||
fi
|
||||
|
||||
# Check for knem options
|
||||
AC_ARG_WITH(knem, [--with-knem=path - specify path where knem include directory can be found],
|
||||
if test "${with_knem}" != "yes" -a "${with_knem}" != "no" ; then
|
||||
CPPFLAGS="$CPPFLAGS -I${with_knem}/include"
|
||||
fi,)
|
||||
AC_ARG_WITH(knem-include, [--with-knem-include=path - specify path to knem include directory],
|
||||
if test "${with_knem_include}" != "yes" -a "${with_knem_include}" != "no" ; then
|
||||
CPPFLAGS="$CPPFLAGS -I${with_knem_include}"
|
||||
fi,)
|
||||
|
||||
AC_CHECK_HEADERS([knem_io.h], pac_cv_have_knem_io_h=yes,pac_cv_have_knem_io_h=no,)
|
||||
if test "${pac_cv_have_knem_io_h}" = yes ; then
|
||||
AC_DEFINE(HAVE_KNEM_IO_H,1,[Define if you have the <knem_io.h> header file.])
|
||||
fi
|
||||
|
||||
# allow the user to select different local LMT implementations
|
||||
AC_ARG_WITH(nemesis-local-lmt, [--with-nemesis-local-lmt=method - specify an implementation for local large message transfers (LMT). Method is one of: 'default', 'shm_copy', 'knem', or 'none'. 'default' is the same as 'shm_copy'.],,with_nemesis_local_lmt=default)
|
||||
AC_ARG_WITH(nemesis-local-lmt, [--with-nemesis-local-lmt=method - specify an implementation for local large message transfers (LMT). Method is one of: 'default', 'shm_copy', or 'none'. 'default' is the same as 'shm_copy'.],,with_nemesis_local_lmt=default)
|
||||
case "$with_nemesis_local_lmt" in
|
||||
shm_copy|default)
|
||||
local_lmt_impl=MPID_NEM_LOCAL_LMT_SHM_COPY
|
||||
;;
|
||||
dma|shm_dma|knem)
|
||||
if test "${pac_cv_have_knem_io_h}" != yes ; then
|
||||
AC_MSG_ERROR([Failed to find knem_io.h for nemesis-local-lmt=knem])
|
||||
fi
|
||||
local_lmt_impl=MPID_NEM_LOCAL_LMT_DMA
|
||||
;;
|
||||
vmsplice)
|
||||
local_lmt_impl=MPID_NEM_LOCAL_LMT_VMSPLICE
|
||||
;;
|
||||
none)
|
||||
local_lmt_impl=MPID_NEM_LOCAL_LMT_NONE
|
||||
;;
|
||||
|
||||
Referência em uma Nova Issue
Bloquear um usuário