From 7fcaded3a4d03670518bb8a41e9e80770caed09a Mon Sep 17 00:00:00 2001 From: Caio Rocha Date: Fri, 22 May 2026 21:18:13 +0000 Subject: [PATCH 01/11] wip --- .azure-pipelines/multi-nodes-test.yml | 14 + .../algos/multi_node_transfer.py | 85 ++++ .../algos/multi_node_transfer_pkt.py | 70 ++++ .../execution-plans/multi_node_transfer.json | 369 ++++++++++++++++++ .../multi_node_transfer_pkt.json | 239 ++++++++++++ 5 files changed, 777 insertions(+) create mode 100644 test/executor-tests/algos/multi_node_transfer.py create mode 100644 test/executor-tests/algos/multi_node_transfer_pkt.py create mode 100644 test/executor-tests/execution-plans/multi_node_transfer.json create mode 100644 test/executor-tests/execution-plans/multi_node_transfer_pkt.json diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 3b3ebe1ff..0a9028d23 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -117,6 +117,20 @@ jobs: remoteScript: | bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodeExecutorTests + displayName: Run multi-nodes executor tests + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' + remoteScript: | + HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi + PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans + TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py + MPI_ARGS="--allow-run-as-root --bind-to numa -hostfile ${HOSTFILE} -mca btl_tcp_if_include eth0" + MSCCLPP_ENV="-x MSCCLPP_DEBUG=WARN -x MSCCLPP_SOCKET_IFNAME=eth0 -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH" + mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} python3 $TEST_SCRIPT -path $PLANS_DIR/multi_node_transfer.json --size 1M --in_place + mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} python3 $TEST_SCRIPT -path $PLANS_DIR/multi_node_transfer_pkt.json --size 1M --in_place + - template: templates/stop.yml parameters: subscription: mscclpp-ci-h100 diff --git a/test/executor-tests/algos/multi_node_transfer.py b/test/executor-tests/algos/multi_node_transfer.py new file mode 100644 index 000000000..2fff1e7a4 --- /dev/null +++ b/test/executor-tests/algos/multi_node_transfer.py @@ -0,0 +1,85 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +Multi-Node Transfer Test + +This file tests the SIGNAL, WAIT, PUT, PUT_WITH_SIGNAL and +PUT_WITH_SIGNAL_AND_FLUSH operations on PortChannels in a multi-node +environment. It implements a 2-GPU allgather using the Simple protocol, +exercising the different port-channel synchronization primitives. +""" + +import argparse +from mscclpp.language.channel import * +from mscclpp.language.rank import * +from mscclpp.language.general import * +from mscclpp.language.program import * +from mscclpp.language.collectives import * + + +def multi_node_transfer(name, num_threads_per_block, min_message_size, max_message_size): + chunksperloop = 2 + gpu_size = 2 + collective = AllGather(gpu_size, chunksperloop, True) + with CollectiveProgram( + name, + collective, + gpu_size, + protocol="Simple", + num_threads_per_block=num_threads_per_block, + use_double_scratch_buffer=False, + min_message_size=min_message_size, + max_message_size=max_message_size, + ): + # Setup ranks, channels, output and scratch buffers for 2-GPU allgather + first_rank = Rank(0) + second_rank = Rank(1) + first_ch1 = PortChannel(1, 0) + second_ch1 = PortChannel(0, 1) + first_ch2 = PortChannel(1, 0) + second_ch2 = PortChannel(0, 1) + first_output_buffer = first_rank.get_output_buffer() + second_output_buffer = second_rank.get_output_buffer() + + # Initial handshake on both port channels: peers exchange SIGNAL/WAIT to + # ensure remote buffers are ready before any data transfer begins. + first_ch1.signal(tb=0) + second_ch1.signal(tb=0) + first_ch1.wait(tb=0) + second_ch1.wait(tb=0) + first_ch2.signal(tb=1) + second_ch2.signal(tb=1) + first_ch2.wait(tb=1) + second_ch2.wait(tb=1) + + # Rank 0 -> rank 1 via ch1: PUT followed by an explicit SIGNAL and FLUSH + first_ch1.put(second_output_buffer[0:1], first_output_buffer[0:1], tb=0) + first_ch1.signal(tb=0) + first_ch1.flush(tb=0) + # Rank 0 -> rank 1 via ch2: PUT_WITH_SIGNAL fuses the data transfer with + # the completion signal, followed by a separate FLUSH + first_ch2.put_with_signal(second_output_buffer[1:2], first_output_buffer[1:2], tb=1) + first_ch2.flush(tb=1) + # Rank 1 -> rank 0 via ch1: PUT_WITH_SIGNAL_AND_FLUSH fuses PUT, SIGNAL + # and FLUSH into a single operation + second_ch1.put_with_signal_and_flush(first_output_buffer[2:4], second_output_buffer[2:4], tb=0) + + # Final WAITs ensure all incoming transfers have completed on each rank + first_ch1.wait(tb=0) + second_ch1.wait(tb=0) + second_ch2.wait(tb=1) + + print(JSON()) + + +parser = argparse.ArgumentParser() + +parser.add_argument("--name", type=str, help="name of the program") +parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block") +parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size") +parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size") + +args = parser.parse_args() + +multi_node_transfer(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size) diff --git a/test/executor-tests/algos/multi_node_transfer_pkt.py b/test/executor-tests/algos/multi_node_transfer_pkt.py new file mode 100644 index 000000000..6924a5cbc --- /dev/null +++ b/test/executor-tests/algos/multi_node_transfer_pkt.py @@ -0,0 +1,70 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +Multi-Node Transfer Pack Test + +This file tests the PUT_PACKETS and READ_PUT_PACKETS operations on +PortChannels in a multi-node environment. It implements a 2-GPU allgather +with the LL (low-latency) packet protocol, using port channels for inter-node +communication. +""" + +import argparse +from mscclpp.language.channel import * +from mscclpp.language.rank import * +from mscclpp.language.general import * +from mscclpp.language.program import * +from mscclpp.language.collectives import * + + +def multi_node_transfer_pkt(name, num_threads_per_block, min_message_size, max_message_size): + chunksperloop = 1 + gpu_size = 2 + collective = AllGather(gpu_size, chunksperloop, True) + with CollectiveProgram( + name, + collective, + gpu_size, + protocol="LL", + num_threads_per_block=num_threads_per_block, + use_double_scratch_buffer=True, + min_message_size=min_message_size, + max_message_size=max_message_size, + ): + # Setup ranks, channels, output and scratch buffers for 2-GPU allgather + first_rank = Rank(0) + second_rank = Rank(1) + first_ch = PortChannel(1, 0) + second_ch = PortChannel(0, 1) + first_output_buffer = first_rank.get_output_buffer() + second_output_buffer = second_rank.get_output_buffer() + first_scratch_buffer = Buffer(0, 2) + second_scratch_buffer = Buffer(1, 2) + + # Each rank stages its own output chunk into its local scratch buffer as packets + first_rank.copy_packets(first_scratch_buffer[0:1], first_output_buffer[0:1], tb=0) + second_rank.copy_packets(second_scratch_buffer[1:2], second_output_buffer[1:2], tb=0) + + # Rank 0 pushes packets to rank 1's scratch via port channel (PUT_PACKETS) + # Rank 1 reads from rank 0's scratch and pushes packets back via port channel (READ_PUT_PACKETS) + first_ch.put_packets(second_scratch_buffer[0:1], first_scratch_buffer[0:1], tb=0) + second_ch.read_put_packets(first_scratch_buffer[1:2], second_scratch_buffer[1:2], tb=1) + + # Both ranks unpack received packets from scratch into their output buffers + first_rank.unpack_packets(first_output_buffer[1:2], first_scratch_buffer[1:2], tb=1) + second_rank.unpack_packets(second_output_buffer[0:1], second_scratch_buffer[0:1], tb=2) + + print(JSON()) + + +parser = argparse.ArgumentParser() + +parser.add_argument("--name", type=str, help="name of the program") +parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block") +parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size") +parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size") + +args = parser.parse_args() + +multi_node_transfer_pkt(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size) diff --git a/test/executor-tests/execution-plans/multi_node_transfer.json b/test/executor-tests/execution-plans/multi_node_transfer.json new file mode 100644 index 000000000..a08ec870e --- /dev/null +++ b/test/executor-tests/execution-plans/multi_node_transfer.json @@ -0,0 +1,369 @@ +{ + "name": "multi_node_transfer", + "collective": "allgather", + "protocol": "Simple", + "inplace": true, + "reuse_resources": false, + "gpus": [ + { + "id": 0, + "input_chunks": 2, + "output_chunks": 4, + "scratch_chunks": 0, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "nop" + }, + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "put", + "src_buff": [ + { + "type": "o", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 0, + "size": 1 + } + ], + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "flush", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "nop" + } + ], + "channels": [ + { + "channel_type": "port", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "port", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "nop" + }, + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "pws", + "src_buff": [ + { + "type": "o", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 1, + "size": 1 + } + ], + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "flush", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "nop" + } + ], + "channels": [ + { + "channel_type": "port", + "channel_ids": [ + 1 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "port", + "remote_buffer_ids": [ + 0 + ] + } + ] + } + ], + "channels": [ + { + "channel_type": "port", + "connected_to": [ + 1, + 1 + ] + } + ], + "remote_buffers": [ + { + "rank": 1, + "type": "o", + "access_channel_types": [ + "port" + ] + } + ], + "semaphores": [] + }, + { + "id": 1, + "input_chunks": 2, + "output_chunks": 4, + "scratch_chunks": 0, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "nop" + }, + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "pwsf", + "src_buff": [ + { + "type": "o", + "index": 2, + "size": 2 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 2, + "size": 2 + } + ], + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "nop" + } + ], + "channels": [ + { + "channel_type": "port", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "port", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "nop" + }, + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "nop" + } + ], + "channels": [ + { + "channel_type": "port", + "channel_ids": [ + 1 + ] + } + ], + "remote_buffer_refs": [] + } + ], + "channels": [ + { + "channel_type": "port", + "connected_to": [ + 0, + 0 + ] + } + ], + "remote_buffers": [ + { + "rank": 0, + "type": "o", + "access_channel_types": [ + "port" + ] + } + ], + "semaphores": [] + } + ], + "num_threads_per_block": 1024, + "use_double_scratch_buffer": false, + "buffer_alignment": 16, + "min_message_size": 0, + "max_message_size": 18446744073709551615 +} diff --git a/test/executor-tests/execution-plans/multi_node_transfer_pkt.json b/test/executor-tests/execution-plans/multi_node_transfer_pkt.json new file mode 100644 index 000000000..677d3dfc8 --- /dev/null +++ b/test/executor-tests/execution-plans/multi_node_transfer_pkt.json @@ -0,0 +1,239 @@ +{ + "name": "multi_node_transfer", + "collective": "allgather", + "protocol": "LL", + "inplace": true, + "reuse_resources": false, + "gpus": [ + { + "id": 0, + "input_chunks": 1, + "output_chunks": 2, + "scratch_chunks": 2, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "cpkt", + "src_buff": [ + { + "type": "o", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "s", + "index": 0, + "size": 1 + } + ] + }, + { + "name": "ppkt", + "src_buff": [ + { + "type": "s", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 0, + "size": 1 + } + ], + "channel_ids": [ + 0 + ], + "channel_type": "port" + } + ], + "channels": [ + { + "channel_type": "port", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "port", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "upkt", + "src_buff": [ + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "o", + "index": 1, + "size": 1 + } + ] + } + ], + "channels": [], + "remote_buffer_refs": [] + } + ], + "channels": [ + { + "channel_type": "port", + "connected_to": [ + 1 + ] + } + ], + "remote_buffers": [ + { + "rank": 1, + "type": "s", + "access_channel_types": [ + "port" + ] + } + ], + "semaphores": [] + }, + { + "id": 1, + "input_chunks": 1, + "output_chunks": 2, + "scratch_chunks": 2, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "cpkt", + "src_buff": [ + { + "type": "o", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "s", + "index": 1, + "size": 1 + } + ] + } + ], + "channels": [], + "remote_buffer_refs": [] + }, + { + "id": 1, + "ops": [ + { + "name": "rppkt", + "src_buff": [ + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 1, + "size": 1 + } + ], + "channel_ids": [ + 0 + ], + "channel_type": "port" + } + ], + "channels": [ + { + "channel_type": "port", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "port", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 2, + "ops": [ + { + "name": "upkt", + "src_buff": [ + { + "type": "s", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "o", + "index": 0, + "size": 1 + } + ] + } + ], + "channels": [], + "remote_buffer_refs": [] + } + ], + "channels": [ + { + "channel_type": "port", + "connected_to": [ + 0 + ] + } + ], + "remote_buffers": [ + { + "rank": 0, + "type": "s", + "access_channel_types": [ + "port" + ] + } + ], + "semaphores": [] + } + ], + "num_threads_per_block": 1024, + "use_double_scratch_buffer": true, + "buffer_alignment": 16, + "min_message_size": 0, + "max_message_size": 18446744073709551615 +} From 105cc34ab7ed137f760e6644c4f340c4a9c79299 Mon Sep 17 00:00:00 2001 From: Caio Rocha Date: Fri, 22 May 2026 21:29:29 +0000 Subject: [PATCH 02/11] testing multinodes executes test isolated --- .azure-pipelines/multi-nodes-test.yml | 64 +++++++++++++-------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 0a9028d23..b2d710400 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -84,38 +84,38 @@ jobs: resourceGroup: mscclpp gpuArch: '90' - - template: templates/run-remote-task.yml - parameters: - name: RunMscclppTest - displayName: Run multi-nodes mscclpp-test - continueOnError: true - runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' - remoteScript: | - bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test - - - template: templates/run-remote-task.yml - parameters: - name: RunMultiNodeUnitTest - displayName: Run multi-nodes unit tests - runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' - remoteScript: | - bash /root/mscclpp/test/deploy/run_tests.sh mp-ut - - - template: templates/run-remote-task.yml - parameters: - name: RunMultiNodePythonTests - displayName: Run multi-nodes python tests - runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' - remoteScript: | - bash /root/mscclpp/test/deploy/run_tests.sh pytests - - - template: templates/run-remote-task.yml - parameters: - name: RunMultiNodePythonBenchmark - displayName: Run multi-nodes python benchmark - runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' - remoteScript: | - bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark + # - template: templates/run-remote-task.yml + # parameters: + # name: RunMscclppTest + # displayName: Run multi-nodes mscclpp-test + # continueOnError: true + # runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' + # remoteScript: | + # bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test + + # - template: templates/run-remote-task.yml + # parameters: + # name: RunMultiNodeUnitTest + # displayName: Run multi-nodes unit tests + # runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' + # remoteScript: | + # bash /root/mscclpp/test/deploy/run_tests.sh mp-ut + + # - template: templates/run-remote-task.yml + # parameters: + # name: RunMultiNodePythonTests + # displayName: Run multi-nodes python tests + # runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' + # remoteScript: | + # bash /root/mscclpp/test/deploy/run_tests.sh pytests + + # - template: templates/run-remote-task.yml + # parameters: + # name: RunMultiNodePythonBenchmark + # displayName: Run multi-nodes python benchmark + # runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' + # remoteScript: | + # bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark - template: templates/run-remote-task.yml parameters: From 21e93601e2c0093e7aed9dc86d741bd859f5272f Mon Sep 17 00:00:00 2001 From: Caio Rocha Date: Tue, 26 May 2026 16:43:08 +0000 Subject: [PATCH 03/11] installing python --- .azure-pipelines/multi-nodes-test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index b2d710400..7797f9e61 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -123,6 +123,7 @@ jobs: displayName: Run multi-nodes executor tests runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' remoteScript: | + python3 -m pip install . HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py From 875b551db22f9833e752c9341933316af411ca55 Mon Sep 17 00:00:00 2001 From: Caio Rocha Date: Tue, 26 May 2026 18:21:27 +0000 Subject: [PATCH 04/11] using venv --- .azure-pipelines/multi-nodes-test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 7797f9e61..cf23daaf6 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -123,14 +123,14 @@ jobs: displayName: Run multi-nodes executor tests runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' remoteScript: | - python3 -m pip install . HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py + PYTHON_BIN=/root/venv/bin/python3 MPI_ARGS="--allow-run-as-root --bind-to numa -hostfile ${HOSTFILE} -mca btl_tcp_if_include eth0" MSCCLPP_ENV="-x MSCCLPP_DEBUG=WARN -x MSCCLPP_SOCKET_IFNAME=eth0 -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH" - mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} python3 $TEST_SCRIPT -path $PLANS_DIR/multi_node_transfer.json --size 1M --in_place - mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} python3 $TEST_SCRIPT -path $PLANS_DIR/multi_node_transfer_pkt.json --size 1M --in_place + mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} ${PYTHON_BIN} $TEST_SCRIPT -path $PLANS_DIR/multi_node_transfer.json --size 1M --in_place + mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} ${PYTHON_BIN} $TEST_SCRIPT -path $PLANS_DIR/multi_node_transfer_pkt.json --size 1M --in_place - template: templates/stop.yml parameters: From 1a6634315e01e4e2375b7675ec43cd92b7d7803e Mon Sep 17 00:00:00 2001 From: Caio Rocha Date: Tue, 26 May 2026 22:26:33 +0000 Subject: [PATCH 05/11] hardcoding hostfile --- .azure-pipelines/multi-nodes-test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index cf23daaf6..f228b8f23 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -123,7 +123,8 @@ jobs: displayName: Run multi-nodes executor tests runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' remoteScript: | - HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi + HOSTFILE=/tmp/hostfile_mpi_executor + printf '%s\n%s\n' '10.0.0.5' '10.0.0.4' > ${HOSTFILE} PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py PYTHON_BIN=/root/venv/bin/python3 From 5b5cb50e2809fb2bf30476734144043d2ac43614 Mon Sep 17 00:00:00 2001 From: Caio Rocha Date: Tue, 26 May 2026 22:57:49 +0000 Subject: [PATCH 06/11] wip --- .azure-pipelines/multi-nodes-test.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index f228b8f23..5cdb2b795 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -123,12 +123,10 @@ jobs: displayName: Run multi-nodes executor tests runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' remoteScript: | - HOSTFILE=/tmp/hostfile_mpi_executor - printf '%s\n%s\n' '10.0.0.5' '10.0.0.4' > ${HOSTFILE} PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py PYTHON_BIN=/root/venv/bin/python3 - MPI_ARGS="--allow-run-as-root --bind-to numa -hostfile ${HOSTFILE} -mca btl_tcp_if_include eth0" + MPI_ARGS="--allow-run-as-root --bind-to numa -host 10.0.0.5,10.0.0.4 -mca btl_tcp_if_include eth0" MSCCLPP_ENV="-x MSCCLPP_DEBUG=WARN -x MSCCLPP_SOCKET_IFNAME=eth0 -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH" mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} ${PYTHON_BIN} $TEST_SCRIPT -path $PLANS_DIR/multi_node_transfer.json --size 1M --in_place mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} ${PYTHON_BIN} $TEST_SCRIPT -path $PLANS_DIR/multi_node_transfer_pkt.json --size 1M --in_place From 368a17f45d7cd2e3b12ddefcb687e3b3d7bdbdd7 Mon Sep 17 00:00:00 2001 From: Caio Rocha Date: Wed, 27 May 2026 00:11:22 +0000 Subject: [PATCH 07/11] wip --- .azure-pipelines/multi-nodes-test.yml | 8 +------- test/deploy/run_tests.sh | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 5cdb2b795..b02f65b58 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -123,13 +123,7 @@ jobs: displayName: Run multi-nodes executor tests runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' remoteScript: | - PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans - TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py - PYTHON_BIN=/root/venv/bin/python3 - MPI_ARGS="--allow-run-as-root --bind-to numa -host 10.0.0.5,10.0.0.4 -mca btl_tcp_if_include eth0" - MSCCLPP_ENV="-x MSCCLPP_DEBUG=WARN -x MSCCLPP_SOCKET_IFNAME=eth0 -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH" - mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} ${PYTHON_BIN} $TEST_SCRIPT -path $PLANS_DIR/multi_node_transfer.json --size 1M --in_place - mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} ${PYTHON_BIN} $TEST_SCRIPT -path $PLANS_DIR/multi_node_transfer_pkt.json --size 1M --in_place + bash /root/mscclpp/test/deploy/run_tests.sh executor-tests - template: templates/stop.yml parameters: diff --git a/test/deploy/run_tests.sh b/test/deploy/run_tests.sh index 6a70c76e7..ba86595b7 100644 --- a/test/deploy/run_tests.sh +++ b/test/deploy/run_tests.sh @@ -96,8 +96,18 @@ function run_py_benchmark() -x MSCCLPP_HOME=/root/mscclpp -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py } +function run_executor_tests() +{ + echo "==================Run multi-node executor tests======================" + PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans + TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py + PYTHON_BIN=/root/venv/bin/python3 + mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} ${PYTHON_BIN} $TEST_SCRIPT -path $PLANS_DIR/multi_node_transfer.json --size 1M --in_place + mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} ${PYTHON_BIN} $TEST_SCRIPT -path $PLANS_DIR/multi_node_transfer_pkt.json --size 1M --in_place +} + if [ $# -lt 1 ]; then - echo "Usage: $0 " + echo "Usage: $0 " exit 1 fi test_name=$1 @@ -118,6 +128,10 @@ case $test_name in echo "==================Run python benchmark================================" run_py_benchmark ;; + executor-tests) + echo "==================Run executor tests=================================" + run_executor_tests + ;; *) echo "Unknown test name: $test_name" exit 1 From 67288e6c33dc73d8fc4f9707904ee313dba9d3d0 Mon Sep 17 00:00:00 2001 From: Caio Rocha Date: Wed, 27 May 2026 00:46:42 +0000 Subject: [PATCH 08/11] wip --- .azure-pipelines/multi-nodes-test.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index b02f65b58..2748813d7 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -84,14 +84,14 @@ jobs: resourceGroup: mscclpp gpuArch: '90' - # - template: templates/run-remote-task.yml - # parameters: - # name: RunMscclppTest - # displayName: Run multi-nodes mscclpp-test - # continueOnError: true - # runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' - # remoteScript: | - # bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test + - template: templates/run-remote-task.yml + parameters: + name: RunMscclppTest + displayName: Run multi-nodes mscclpp-test + continueOnError: true + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test # - template: templates/run-remote-task.yml # parameters: From e6b002d7b16336746136a9e4b2ed27c5ac7d0f01 Mon Sep 17 00:00:00 2001 From: Caio Rocha Date: Wed, 27 May 2026 02:30:52 +0000 Subject: [PATCH 09/11] wip --- test/deploy/run_tests.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/deploy/run_tests.sh b/test/deploy/run_tests.sh index ba86595b7..f586fb256 100644 --- a/test/deploy/run_tests.sh +++ b/test/deploy/run_tests.sh @@ -101,9 +101,9 @@ function run_executor_tests() echo "==================Run multi-node executor tests======================" PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py - PYTHON_BIN=/root/venv/bin/python3 + PYTHON_BIN=python3 mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} ${PYTHON_BIN} $TEST_SCRIPT -path $PLANS_DIR/multi_node_transfer.json --size 1M --in_place - mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} ${PYTHON_BIN} $TEST_SCRIPT -path $PLANS_DIR/multi_node_transfer_pkt.json --size 1M --in_place + #mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} ${PYTHON_BIN} $TEST_SCRIPT -path $PLANS_DIR/multi_node_transfer_pkt.json --size 1M --in_place } if [ $# -lt 1 ]; then From 95166a2ec51f0a355d25df1b0d20aa7876744bb8 Mon Sep 17 00:00:00 2001 From: Caio Rocha Date: Wed, 27 May 2026 21:01:37 +0000 Subject: [PATCH 10/11] wip --- .../test.json} | 2 +- .../execution-plans/multi_node_transfer.json | 369 ------------------ 2 files changed, 1 insertion(+), 370 deletions(-) rename test/executor-tests/{execution-plans/multi_node_transfer_pkt.json => algos/test.json} (99%) delete mode 100644 test/executor-tests/execution-plans/multi_node_transfer.json diff --git a/test/executor-tests/execution-plans/multi_node_transfer_pkt.json b/test/executor-tests/algos/test.json similarity index 99% rename from test/executor-tests/execution-plans/multi_node_transfer_pkt.json rename to test/executor-tests/algos/test.json index 677d3dfc8..2b7195bac 100644 --- a/test/executor-tests/execution-plans/multi_node_transfer_pkt.json +++ b/test/executor-tests/algos/test.json @@ -1,5 +1,5 @@ { - "name": "multi_node_transfer", + "name": "allgather", "collective": "allgather", "protocol": "LL", "inplace": true, diff --git a/test/executor-tests/execution-plans/multi_node_transfer.json b/test/executor-tests/execution-plans/multi_node_transfer.json deleted file mode 100644 index a08ec870e..000000000 --- a/test/executor-tests/execution-plans/multi_node_transfer.json +++ /dev/null @@ -1,369 +0,0 @@ -{ - "name": "multi_node_transfer", - "collective": "allgather", - "protocol": "Simple", - "inplace": true, - "reuse_resources": false, - "gpus": [ - { - "id": 0, - "input_chunks": 2, - "output_chunks": 4, - "scratch_chunks": 0, - "threadblocks": [ - { - "id": 0, - "ops": [ - { - "name": "nop" - }, - { - "name": "signal", - "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - }, - { - "name": "wait", - "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - }, - { - "name": "put", - "src_buff": [ - { - "type": "o", - "index": 0, - "size": 1 - } - ], - "dst_buff": [ - { - "buffer_id": 0, - "index": 0, - "size": 1 - } - ], - "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - }, - { - "name": "signal", - "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - }, - { - "name": "flush", - "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - }, - { - "name": "wait", - "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - } - ], - "channels": [ - { - "channel_type": "port", - "channel_ids": [ - 0 - ] - } - ], - "remote_buffer_refs": [ - { - "access_channel_type": "port", - "remote_buffer_ids": [ - 0 - ] - } - ] - }, - { - "id": 1, - "ops": [ - { - "name": "nop" - }, - { - "name": "signal", - "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - }, - { - "name": "wait", - "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - }, - { - "name": "pws", - "src_buff": [ - { - "type": "o", - "index": 1, - "size": 1 - } - ], - "dst_buff": [ - { - "buffer_id": 0, - "index": 1, - "size": 1 - } - ], - "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - }, - { - "name": "flush", - "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - } - ], - "channels": [ - { - "channel_type": "port", - "channel_ids": [ - 1 - ] - } - ], - "remote_buffer_refs": [ - { - "access_channel_type": "port", - "remote_buffer_ids": [ - 0 - ] - } - ] - } - ], - "channels": [ - { - "channel_type": "port", - "connected_to": [ - 1, - 1 - ] - } - ], - "remote_buffers": [ - { - "rank": 1, - "type": "o", - "access_channel_types": [ - "port" - ] - } - ], - "semaphores": [] - }, - { - "id": 1, - "input_chunks": 2, - "output_chunks": 4, - "scratch_chunks": 0, - "threadblocks": [ - { - "id": 0, - "ops": [ - { - "name": "nop" - }, - { - "name": "signal", - "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - }, - { - "name": "wait", - "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - }, - { - "name": "pwsf", - "src_buff": [ - { - "type": "o", - "index": 2, - "size": 2 - } - ], - "dst_buff": [ - { - "buffer_id": 0, - "index": 2, - "size": 2 - } - ], - "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - }, - { - "name": "wait", - "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - } - ], - "channels": [ - { - "channel_type": "port", - "channel_ids": [ - 0 - ] - } - ], - "remote_buffer_refs": [ - { - "access_channel_type": "port", - "remote_buffer_ids": [ - 0 - ] - } - ] - }, - { - "id": 1, - "ops": [ - { - "name": "nop" - }, - { - "name": "signal", - "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - }, - { - "name": "wait", - "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - }, - { - "name": "wait", - "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - } - ], - "channels": [ - { - "channel_type": "port", - "channel_ids": [ - 1 - ] - } - ], - "remote_buffer_refs": [] - } - ], - "channels": [ - { - "channel_type": "port", - "connected_to": [ - 0, - 0 - ] - } - ], - "remote_buffers": [ - { - "rank": 0, - "type": "o", - "access_channel_types": [ - "port" - ] - } - ], - "semaphores": [] - } - ], - "num_threads_per_block": 1024, - "use_double_scratch_buffer": false, - "buffer_alignment": 16, - "min_message_size": 0, - "max_message_size": 18446744073709551615 -} From 19bc5a29ec5f8bc28abb0f7f377732a95b91086b Mon Sep 17 00:00:00 2001 From: Caio Rocha Date: Wed, 27 May 2026 21:51:31 +0000 Subject: [PATCH 11/11] wip --- test/deploy/run_tests.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test/deploy/run_tests.sh b/test/deploy/run_tests.sh index f586fb256..dbac9cbfe 100644 --- a/test/deploy/run_tests.sh +++ b/test/deploy/run_tests.sh @@ -99,11 +99,18 @@ function run_py_benchmark() function run_executor_tests() { echo "==================Run multi-node executor tests======================" + ALGOS_DIR=/root/mscclpp/test/executor-tests/algos PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py - PYTHON_BIN=python3 + PYTHON_BIN=/root/venv/bin/python3 + + echo "Generating execution plans" + ${PYTHON_BIN} ${ALGOS_DIR}/multi_node_transfer.py --name multi_node_transfer > ${PLANS_DIR}/multi_node_transfer.json + ${PYTHON_BIN} ${ALGOS_DIR}/multi_node_transfer_pkt.py --name multi_node_transfer_pkt > ${PLANS_DIR}/multi_node_transfer_pkt.json + + echo "Running multi-node transfer test with in-place buffers" mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} ${PYTHON_BIN} $TEST_SCRIPT -path $PLANS_DIR/multi_node_transfer.json --size 1M --in_place - #mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} ${PYTHON_BIN} $TEST_SCRIPT -path $PLANS_DIR/multi_node_transfer_pkt.json --size 1M --in_place + mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} ${PYTHON_BIN} $TEST_SCRIPT -path $PLANS_DIR/multi_node_transfer_pkt.json --size 1M --in_place } if [ $# -lt 1 ]; then