git: fd490a171c3d - 2021Q3 - net/mpich: unbreak optimized runtime after 88e134883dd2

Jan Beich jbeich at FreeBSD.org
Mon Sep 6 22:26:16 UTC 2021


The branch 2021Q3 has been updated by jbeich:

URL: https://cgit.FreeBSD.org/ports/commit/?id=fd490a171c3da0d7bcb9a5f3ee3b4b46075dfa9e

commit fd490a171c3da0d7bcb9a5f3ee3b4b46075dfa9e
Author:     Henrik Gulbrandsen <henrik at gulbra.net>
AuthorDate: 2021-08-12 14:35:20 +0000
Commit:     Jan Beich <jbeich at FreeBSD.org>
CommitDate: 2021-09-06 22:25:57 +0000

    net/mpich: unbreak optimized runtime after 88e134883dd2
    
    Runtime may fail without an L0 (Level Zero) driver such as intel-compute-runtime, e.g.:
    
    $ mpivars
    Abort(268484367) on node 0 (rank 0 in comm 0): Fatal error in PMPI_Init_thread: Other MPI error, error stack:
    MPIR_Init_thread(153):  gpu_init failed
    [unset]: write_line error; fd=-1 buf=:cmd=abort exitcode=268484367
    :
    system msg for write_line failure : Bad file descriptor
    Attempting to use an MPI routine before initializing MPICH
    
    $ MPIR_CVAR_ENABLE_GPU=0 mpivars
    Abort(2139535) on node 0 (rank 0 in comm 0): Fatal error in PMPI_Init_thread: Other MPI error, error stack:
    MPIR_Init_thread(159)......:
    MPID_Init(591).............:
    MPIDI_SHM_mpi_init_hook(22):
    MPIDI_IPC_mpi_init_hook(36):
    MPIDI_GPU_mpi_init_hook(79):  gpu_get_dev_count failed
    [unset]: write_line error; fd=-1 buf=:cmd=abort exitcode=2139535
    :
    system msg for write_line failure : Bad file descriptor
    Abort(2139535) on node 0 (rank 0 in comm 0): Fatal error in PMPI_Init_thread: Other MPI error, error stack:
    MPIR_Init_thread(159)......:
    MPID_Init(591).............:
    MPIDI_SHM_mpi_init_hook(22):
    MPIDI_IPC_mpi_init_hook(36):
    MPIDI_GPU_mpi_init_hook(79):  gpu_get_dev_count failed
    [unset]: write_line error; fd=-1 buf=:cmd=abort exitcode=2139535
    :
    system msg for write_line failure : Bad file descriptor
    Segmentation fault
    
    PR:             256244 (for tracking)
    (cherry picked from commit b5815e7648a8e5307a20a234befa00e34306319d)
---
 net/mpich/Makefile                |  2 +-
 net/mpich/files/patch-l0-fallback | 44 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/net/mpich/Makefile b/net/mpich/Makefile
index 9741b1b75d7f..295897406b27 100644
--- a/net/mpich/Makefile
+++ b/net/mpich/Makefile
@@ -1,6 +1,6 @@
 PORTNAME=	mpich
 PORTVERSION=	3.4.2
-PORTREVISION=	2
+PORTREVISION=	3
 CATEGORIES=	net parallel
 MASTER_SITES=	https://www.mpich.org/static/downloads/${DISTVERSION}/
 
diff --git a/net/mpich/files/patch-l0-fallback b/net/mpich/files/patch-l0-fallback
new file mode 100644
index 000000000000..35f18dc272a5
--- /dev/null
+++ b/net/mpich/files/patch-l0-fallback
@@ -0,0 +1,44 @@
+$ pkg delete intel-compute-runtime
+$ mpivars
+PCI: Failed to initialize libpciaccess with pci_system_init(): 6 (Permission denied)
+Abort(268484367) on node 0 (rank 0 in comm 0): Fatal error in PMPI_Init_thread: Other MPI error, error stack:
+MPIR_Init_thread(153):  gpu_init failed
+[unset]: write_line error; fd=-1 buf=:cmd=abort exitcode=268484367
+:
+system msg for write_line failure : Bad file descriptor
+Attempting to use an MPI routine before initializing MPICH
+
+--- src/mpi/init/initthread.c.orig	2021-05-25 17:37:05 UTC
++++ src/mpi/init/initthread.c
+@@ -150,7 +150,9 @@ int MPIR_Init_thread(int *argc, char ***argv, int user
+      * inside MPID_Init */
+     if (MPIR_CVAR_ENABLE_GPU) {
+         int mpl_errno = MPL_gpu_init();
+-        MPIR_ERR_CHKANDJUMP(mpl_errno != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**gpu_init");
++        MPIR_ERR_CHKANDJUMP(
++            mpl_errno != MPL_SUCCESS && mpl_errno != MPL_ERR_GPU_INTERNAL,
++            mpi_errno, MPI_ERR_OTHER, "**gpu_init");
+     }
+ 
+     MPL_atomic_store_int(&MPIR_Process.mpich_state, MPICH_MPI_STATE__IN_INIT);
+--- src/mpid/ch4/netmod/ofi/ofi_init.c.orig	2021-05-25 17:37:05 UTC
++++ src/mpid/ch4/netmod/ofi/ofi_init.c
+@@ -731,7 +731,6 @@ int MPIDI_OFI_mpi_init_hook(int rank, int size, int ap
+             MPL_gpu_malloc_host(&(MPIDI_OFI_global.am_bufs[i]), MPIDI_OFI_AM_BUFF_SZ);
+             MPIDI_OFI_global.am_reqs[i].event_id = MPIDI_OFI_EVENT_AM_RECV;
+             MPIDI_OFI_global.am_reqs[i].index = i;
+-            MPIR_Assert(MPIDI_OFI_global.am_bufs[i]);
+             MPIDI_OFI_global.am_iov[i].iov_base = MPIDI_OFI_global.am_bufs[i];
+             MPIDI_OFI_global.am_iov[i].iov_len = MPIDI_OFI_AM_BUFF_SZ;
+             MPIDI_OFI_global.am_msg[i].msg_iov = &MPIDI_OFI_global.am_iov[i];
+--- src/mpl/src/gpu/mpl_gpu_ze.c.orig	2021-05-25 17:37:05 UTC
++++ src/mpl/src/gpu/mpl_gpu_ze.c
+@@ -33,7 +33,7 @@ int MPL_gpu_get_dev_count(int *dev_cnt, int *dev_id)
+ {
+     int ret = MPL_SUCCESS;
+     if (!gpu_initialized) {
+-        ret = MPL_gpu_init();
++        MPL_gpu_init();
+     }
+ 
+     *dev_cnt = device_count;


More information about the dev-commits-ports-all mailing list