-
Notifications
You must be signed in to change notification settings - Fork 70
/
Copy path: distributed_job.json
26 lines (24 loc) · 1.32 KB
/
distributed_job.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
{
  "jobName": "tensorflow-distributed-001",
  "image": "10.11.3.8:5000/pai-images/pai.run.deepo:v1.1",
  "taskRoles": [
    {
      "name": "ps",
      "taskNumber": 2,
      "cpuNumber": 2,
      "memoryMB": 8192,
      "gpuNumber": 0,
      "command": "cd /userhome && wget -c https://raw.githubusercontent.com/Leinao/LeinaoPAI/master/example/tensorflow-distributed/mnist_replica.py && python /userhome/mnist_replica.py --num_gpus=0 --batch_size=32 --data_dir=/gdata/MNIST --train_dir=/userhome/tensorflow-distributed/output --ps_hosts=$PAI_TASK_ROLE_ps_HOST_LIST --worker_hosts=$PAI_TASK_ROLE_worker_HOST_LIST --job_name=ps --task_index=$PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX"
    },
    {
      "name": "worker",
      "taskNumber": 2,
      "cpuNumber": 2,
      "memoryMB": 16384,
      "gpuNumber": 2,
      "command": "cd /userhome && wget -c https://raw.githubusercontent.com/Leinao/LeinaoPAI/master/example/tensorflow-distributed/mnist_replica.py && python /userhome/mnist_replica.py --num_gpus=2 --batch_size=32 --data_dir=/gdata/MNIST --train_dir=/userhome/tensorflow-distributed/output --ps_hosts=$PAI_TASK_ROLE_ps_HOST_LIST --worker_hosts=$PAI_TASK_ROLE_worker_HOST_LIST --job_name=worker --task_index=$PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX"
    }
  ],
  "killAllOnCompletedTaskNumber": 2,
  "retryCount": 0
}