Man Linux: Main Page and Category List

NAME

       slurm_step_launch_params_t_init,                     slurm_step_launch,
       slurm_step_launch_fwd_signal,             slurm_step_launch_wait_start,
       slurm_step_launch_wait_finish, slurm_step_launch_abort - Slurm job step
       launch functions

SYNTAX

       #include <slurm/slurm.h>

       void slurm_step_launch_params_t_init (
            slurm_step_launch_params_t *launch_req
       );

       int slurm_step_launch (
            slurm_step_ctx ctx,
            const slurm_step_launch_params_t *launch_req,
            const slurm_step_launch_callbacks_t callbacks
       );

       void slurm_step_launch_fwd_signal (
            slurm_step_ctx ctx,
            int signo
       );

       int slurm_step_launch_wait_start (
            slurm_step_ctx ctx
       );

       void slurm_step_launch_wait_finish (
            slurm_step_ctx ctx
       );

       void slurm_step_launch_abort (
            slurm_step_ctx ctx
       );

ARGUMENTS

       callbacks
              Identify functions to be called when various events occur.

       ctx    Job step context.  Created  by  slurm_step_ctx_create,  used  in
              subsequent      function     calls,     and     destroyed     by
              slurm_step_ctx_destroy.

       launch_req
              Pointer  to  a  structure  allocated  by  the  user   containing
              specifications of the job step to be launched.

DESCRIPTION

       slurm_step_launch_params_t_init     Initialize     a     user-allocated
       slurm_step_launch_params_t  structure  with  default  values.   This
       function will NOT allocate any new memory.

       slurm_step_launch Launch a parallel job step.

       slurm_step_launch_fwd_signal  Forward  a signal to all those nodes with
       running tasks.

       slurm_step_launch_wait_start Block until all tasks have started.

       slurm_step_launch_wait_finish Block until all tasks have  finished  (or
       failed to start altogether).

       slurm_step_launch_abort  Abort  an in-progress launch, or terminate the
       fully launched job step. Can be called from a signal handler.

IO Redirection

       Use the local_fds entry in  slurm_step_launch_params_t to specify  file
       descriptors  to  be  used  for  standard  input,  output and error. Any
       local_fds not specified will result in the  launched  tasks  using  the
       calling process’s standard input, output and error.  Threads created by
       slurm_step_launch will  completely  handle  copying  data  between  the
       remote processes and the specified local file descriptors.

       Use the substructure in slurm_step_io_fds_t to restrict the redirection
       of I/O to a specific node or task ID. For example, to redirect standard
       output only from task 0, set

       params.local_fds.out.taskid = 0;

       Use  the remote_*_filename fields in slurm_step_launch_params_t to have
       launched tasks read and/or write directly to local  files  rather  than
       transferring  data  over  the  network  to  the calling process.  These
       strings support many of the same format options as  the  srun  command.
       Any  remote_*_filename  fields  set  will  supersede  the corresponding
       local_fds entries. For example, the following  code  will  direct  each
       task  to  write  standard output and standard error to local files with
       names containing the task ID (e.g.   "/home/bob/test_output/run1.out.0"
       and "/home/bob/test_output/run1.err.0" for task 0).

       params.remote_output_filename = "/home/bob/test_output/run1.out.%t";
       params.remote_error_filename  = "/home/bob/test_output/run1.err.%t";

RETURN VALUE

       slurm_step_launch    and   slurm_step_launch_wait_start   will   return
       SLURM_SUCCESS when all tasks have successfully started, or  SLURM_ERROR
       if the job step is aborted during launch.

ERRORS

       EINVAL Invalid argument

       SLURM_PROTOCOL_VERSION_ERROR Protocol version has changed, re-link your
       code.

       ESLURM_INVALID_JOB_ID the requested job id does not exist.

       ESLURM_ALREADY_DONE the specified job has already completed and can not
       be modified.

       ESLURM_ACCESS_DENIED  the  requesting  user lacks authorization for the
       requested action (e.g. trying to delete or modify another user’s  job).

       ESLURM_INTERCONNECT_FAILURE  failed to configure the node interconnect.

       ESLURM_BAD_DIST task distribution specification is invalid.

       SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT Timeout in communicating with  SLURM
       controller.

EXAMPLE

       /*
        * To compile:
        * gcc test.c -o test -g -pthread -lslurm
        *
        * Or if Slurm is not in your default search paths:
        * gcc test.c -o test -g -pthread -I${SLURM_DIR}/include \
        *     -Wl,--rpath=${SLURM_DIR}/lib -L${SLURM_DIR}/lib -lslurm
        */
       #include <stdio.h>
       #include <stdlib.h>
       #include <string.h>
       #include <slurm/slurm.h>
       #include <slurm/slurm_errno.h>

       /*
        * Registered below as the task_start callback: prints how many
        * tasks were started and the name of the node they started on.
        */
       static void _task_start(launch_tasks_response_msg_t *msg)
       {
            printf("%d tasks started on node %s\n", msg->count_of_pids,
                   msg->node_name);
       }

       /*
        * Registered below as the task_finish callback: prints the number
        * of tasks reported in the exit message.
        */
       static void _task_finish(task_exit_msg_t *msg)
       {
            printf("%d tasks finished\n",
                   msg->num_tasks);
       }

       /*
        * Create a job step context for 4 tasks on 1 node, launch the
        * step using the arguments that follow the program name, then
        * wait for the tasks to start and to finish.
        *
        * Exits with status 1 if the context cannot be created or the
        * launch fails, 0 otherwise.
        */
       int main (int argc, char *argv[])
       {
            slurm_step_ctx_params_t step_params;
            slurm_step_ctx step_ctx;
            slurm_step_launch_params_t params;
            slurm_step_launch_callbacks_t callbacks;
            uint32_t job_id, step_id;

            /* Start from the library defaults, then override. */
            slurm_step_ctx_params_t_init(&step_params);
            step_params.node_count = 1;
            step_params.task_count = 4;
            step_params.overcommit = true;

            step_ctx = slurm_step_ctx_create(&step_params);
            if (step_ctx == NULL) {
                 slurm_perror("slurm_step_ctx_create");
                 exit(1);
            }
            slurm_step_ctx_get(step_ctx, SLURM_STEP_CTX_JOBID, &job_id);
            slurm_step_ctx_get(step_ctx, SLURM_STEP_CTX_STEPID, &step_id);
            printf("Ready to start job %u step %u\n", job_id, step_id);

            /* Everything after the program name becomes the step's argv. */
            slurm_step_launch_params_t_init(&params);
            params.argc = argc - 1;
            params.argv = argv + 1;

            /*
             * Zero the callback table first so that any member not
             * assigned below is NULL instead of an indeterminate stack
             * value.
             */
            memset(&callbacks, 0, sizeof(callbacks));
            callbacks.task_start = _task_start;
            callbacks.task_finish = _task_finish;

            /*
             * NOTE(review): four arguments are passed here, while the
             * SYNTAX section lists three for slurm_step_launch; confirm
             * the prototype in the installed <slurm/slurm.h>.
             */
            if (slurm_step_launch(step_ctx, NULL, &params, &callbacks)
                      != SLURM_SUCCESS) {
                 slurm_perror("slurm_step_launch");
                 exit(1);
            }
            printf("Sent step launch RPC\n");

            if (slurm_step_launch_wait_start(step_ctx) != SLURM_SUCCESS) {
                 fprintf(stderr, "job step was aborted during launch\n");
            } else {
                 printf("All tasks have started\n");
            }

            slurm_step_launch_wait_finish(step_ctx);
            printf("All tasks have finished\n");

            slurm_step_ctx_destroy(step_ctx);
            exit(0);
       }

NOTE

       These  functions  are  included  in the libslurm library, which must be
       linked to your process for use (e.g. "cc myprog.c -lslurm").

COPYING

       Copyright (C) 2006-2007 The Regents of the  University  of  California.
       Copyright  (C)  2008 Lawrence Livermore National Security.  Produced at
       Lawrence    Livermore    National    Laboratory    (cf,    DISCLAIMER).
       CODE-OCEC-09-009. All rights reserved.

       This  file  is  part  of  SLURM,  a  resource  management program.  For
       details, see <https://computing.llnl.gov/linux/slurm/>.

       SLURM is free software; you can redistribute it and/or modify it  under
       the  terms  of  the GNU General Public License as published by the Free
       Software Foundation; either version 2  of  the  License,  or  (at  your
       option) any later version.

       SLURM  is  distributed  in the hope that it will be useful, but WITHOUT
       ANY WARRANTY; without even the implied warranty of  MERCHANTABILITY  or
       FITNESS  FOR  A PARTICULAR PURPOSE.  See the GNU General Public License
       for more details.

SEE ALSO

       slurm_step_ctx_create(3),                    slurm_step_ctx_destroy(3),
       slurm_get_errno(3),   slurm_perror(3),   slurm_strerror(3),  salloc(1),
       srun(1)