You can use SDK for Python to submit deep learning jobs that use public resource groups. This topic describes how to download and install SDK for Python and use SDK for Python to submit deep learning jobs that use public resource groups.

Prerequisites

  • The public resource group of the general training resources is activated. For more information, see "Prepare a cluster of the public resource group type" section in the Prepare a public resource group.
  • The required service-linked role is assigned to DLC. For more information, see Grant the permissions that are required to use DLC.
  • The credentials for identity authentication are configured. You can use environment variables or local files to configure credentials. For more information, see Configure credentials.

Background information

For more information about how to use the console to submit public resource groups of the general training resources, see Submit jobs by using the console.

Step 1: Install the SDK for Python

  • Install the workspace SDK.
    pip install https://sdk-portal-us-prod.oss-accelerate.aliyuncs.com/downloads/u-a397e06e-edea-4756-ad50-fe7c6f7b0bf4-python-aiworkspace.zip
  • Install the DLC SDK.
    # Install python2 sdk. 
    pip install https://sdk-portal-us-prod.oss-accelerate.aliyuncs.com/downloads/u-b7e79745-b9b1-4060-946c-05b1dca491bc-python2-pai-dlc.zip
    # Install python3 sdk. 
    pip install https://sdk-portal-us-prod.oss-accelerate.aliyuncs.com/downloads/u-b7e79745-b9b1-4060-946c-05b1dca491bc-python-pai-dlc.zip

Step 2: Create and submit a deep learning job

Use the public resource group to create and submit a deep learning job

You can use the following sample code to create and submit a deep learning job:
#!/usr/bin/env python3

from __future__ import print_function

import json
import time

from alibabacloud_tea_openapi.models import Config
from alibabacloud_credentials.client import Client as CredClient
from alibabacloud_pai_dlc20201203.client import Client as DLCClient
from alibabacloud_pai_dlc20201203.models import (
    ListJobsRequest,
    ListEcsSpecsRequest,
    CreateJobRequest
)

from alibabacloud_aiworkspace20210204.client import Client as AIWorkspaceClient
from alibabacloud_aiworkspace20210204.models import (
    ListWorkspacesRequest,
    CreateDatasetRequest,
    ListDatasetsRequest,
    ListImagesRequest,
    ListCodeSourcesRequest
)


def create_nas_dataset(client, region, workspace_id, name,
                       nas_id, nas_path, mount_path):
    '''Create a NAS dataset. 
    '''
    response = client.create_dataset(CreateDatasetRequest(
        workspace_id=workspace_id,
        name=name,
        data_type='COMMON',
        data_source_type='NAS',
        property='DIRECTORY',
        uri=f'nas://{nas_id}.{region}{nas_path}',
        accessibility='PRIVATE',
        source_type='USER',
        options=json.dumps({
            'mountPath': mount_path
        })
    ))
    return response.body.dataset_id


def create_oss_dataset(client, region, workspace_id, name,
                       oss_bucket, oss_endpoint, oss_path, mount_path):
    '''Create an Object Storage Service (OSS) dataset. 
    '''
    response = client.create_dataset(CreateDatasetRequest(
        workspace_id=workspace_id,
        name=name,
        data_type='COMMON',
        data_source_type='OSS',
        property='DIRECTORY',
        uri=f'oss://{oss_bucket}.{oss_endpoint}{oss_path}',
        accessibility='PRIVATE',
        source_type='USER',
        options=json.dumps({
            'mountPath': mount_path
        })
    ))
    return response.body.dataset_id



def wait_for_job_to_terminate(client, job_id):
    while True:
        job = client.get_job(job_id).body
        print('job is {}'.format(job.status))
        if job.status in ('Succeeded', 'Failed', 'Stopped'):
            return job.status
        time.sleep(5)
    return None


def main():

    # Make sure that your Alibaba Cloud account is granted the required permissions on DLC. 
    region_id = 'cn-hangzhou'
    # Security risks may arise if you use the AccessKey pair of an Alibaba Cloud account because the account has permissions on all API operations. We recommend that you use a RAM user to call API operations or perform routine O&M. 
    # We do not recommend that you save the AccessKey ID and AccessKey secret in your project code. Otherwise, the AccessKey pair may be leaked and the security of your resources may be compromised. 
    # In this example, the Credentials SDK reads the AccessKey pair from the environment variable to implement identity verification. 
    cred = CredClient()

    # 1. create client;
    workspace_client = AIWorkspaceClient(
        config=Config(
            credential=cred,
            region_id=region_id,
            endpoint="aiworkspace.{}.aliyuncs.com".format(region_id),
        )
    )

    dlc_client = DLCClient(
         config=Config(
            credential=cred,
            region_id=region_id,
            endpoint='pai-dlc.{}.aliyuncs.com'.format(region_id),
         )
    )

    print('------- Workspaces -----------')
    # Obtain the workspace list. You can specify the name of the workspace that you created in the workspace_name parameter. 
    workspaces = workspace_client.list_workspaces(ListWorkspacesRequest(
        page_number=1, page_size=1, workspace_name='',
        module_list='PAI'
    ))
    for workspace in workspaces.body.workspaces:
        print(workspace.workspace_name, workspace.workspace_id,
              workspace.status, workspace.creator)

    if len(workspaces.body.workspaces) == 0:
        raise RuntimeError('found no workspaces')

    workspace_id = workspaces.body.workspaces[0].workspace_id

    print('------- Images ------------')
    # Obtain the image list. 
    images = workspace_client.list_images(ListImagesRequest(
        labels=','.join(['system.supported.dlc=true',
                         'system.framework=Tensorflow 1.15',
                         'system.pythonVersion=3.6',
                         'system.chipType=CPU'])))
    for image in images.body.images:
        print(json.dumps(image.to_map(), indent=2))

    image_uri = images.body.images[0].image_uri

    print('------- Datasets ----------')
    # Obtain the dataset. 
    datasets = workspace_client.list_datasets(ListDatasetsRequest(
        workspace_id=workspace_id,
        name='example-nas-data', properties='DIRECTORY'))
    for dataset in datasets.body.datasets:
        print(dataset.name, dataset.dataset_id, dataset.uri, dataset.options)

    if len(datasets.body.datasets) == 0:
        # Create a dataset when the specified dataset does not exist. 
        dataset_id = create_nas_dataset(
            client=workspace_client,
            region=region_id,
            workspace_id=workspace_id,
            name='example-nas-data',
            # The ID of the NAS file system. 
            # General-purpose NAS: 31a8e4****. 
            # Extreme NAS: The ID must start with extreme-. Example: extreme-0015****. 
            # CPFS: The ID must start with cpfs-. Example: cpfs-125487****. 
            nas_id='***',
            nas_path='/',
            mount_path='/mnt/data/nas')
        print('create dataset with id: {}'.format(dataset_id))
    else:
        dataset_id = datasets.body.datasets[0].dataset_id

    print('------- Code Sources ----------')
    # Obtain the source code file list. 
    code_sources = workspace_client.list_code_sources(ListCodeSourcesRequest(
        workspace_id=workspace_id))
    for code_source in code_sources.body.code_sources:
        print(code_source.display_name, code_source.code_source_id, code_source.code_repo)

    print('-------- ECS SPECS ----------')
    # Obtain the DLC node specification list. 
    ecs_specs = dlc_client.list_ecs_specs(ListEcsSpecsRequest(page_size=100, sort_by='Memory', order='asc'))
    for spec in ecs_specs.body.ecs_specs:
        print(spec.instance_type, spec.cpu, spec.memory, spec.memory, spec.gpu_type)

    print('-------- Create Job ----------')
    # Create a deep learning job. 
    create_job_resp = dlc_client.create_job(CreateJobRequest().from_map({
        'WorkspaceId': workspace_id,
        'DisplayName': 'sample-dlc-job',
        'JobType': 'TFJob',
        'JobSpecs': [
            {
                "Type": "Worker",
                "Image": image_uri,
                "PodCount": 1,
                "EcsSpec": ecs_specs.body.ecs_specs[0].instance_type,
                "UseSpotInstance": False,
            },
        ],
        "UserCommand": "echo 'Hello World' && ls -R /mnt/data/ && sleep 30 && echo 'DONE'",
        'DataSources': [
            {
                "DataSourceId": dataset_id,
            },
        ],
    }))
    job_id = create_job_resp.body.job_id

    wait_for_job_to_terminate(dlc_client, job_id)

    print('-------- List Jobs ----------')
    # Obtain the deep learning job list. 
    jobs = dlc_client.list_jobs(ListJobsRequest(
        workspace_id=workspace_id,
        page_number=1,
        page_size=10,
    ))
    for job in jobs.body.jobs:
        print(job.display_name, job.job_id, job.workspace_name,
              job.status, job.job_type)
    pass


if __name__ == '__main__':
    main()