pypi package 'trainml'

Popularity: Low
Description: trainML client SDK and command line utilities
Installation: pip install trainml
Last version: 0.4.10 (Download)
Homepage: https://github.com/trainML/trainml-cli
Size: 82.11 kB
License: MIT

Activity

Last modified: January 25, 2023 2:38 AM (10 months ago)
Versions released in one year: 4
Weekly downloads: 11
Chart (12/11/2022 – 10/29/2023): released versions per week (0–4) and weekly downloads (0–100)
  • Versions released
  • Weekly downloads

What's new in version 0.4.10

Delta between version 0.4.9 and version 0.4.10

Source: Github
Commits:
  • 37fee506cc71df4ac652a705df0f1e0249e691da, January 24, 2023 11:14 PM:
    adding rename functions and save_model option
  • 355170a065aa21075e869720d352598e12126959, January 25, 2023 2:37 AM:
    incrementing version
Files changed:
examples/training_inference_pipeline.py CHANGED
@@ -30,8 +30,7 @@
30
  "python training/image-classification/resnet_cifar.py --epochs 10 --optimizer adam --batch-size 128",
31
  ],
32
  data=dict(
33
- datasets=[dataset.id],
34
- output_type="trainml",
35
  ),
36
  model=dict(
37
  source_type="git",
@@ -48,7 +47,7 @@
48
  training_job = asyncio.run(training_job.refresh())
49
 
50
  model = asyncio.run(
51
- trainml_client.models.get(training_job.workers[0].get("output_model_uuid"))
52
  )
53
 
54
  # Ensure the model is ready to use
30
  "python training/image-classification/resnet_cifar.py --epochs 10 --optimizer adam --batch-size 128",
31
  ],
32
  data=dict(
33
+ datasets=[dataset.id], output_type="trainml", output_uri="model"
 
34
  ),
35
  model=dict(
36
  source_type="git",
47
  training_job = asyncio.run(training_job.refresh())
48
 
49
  model = asyncio.run(
50
+ trainml_client.models.get(training_job.workers[0].get("output_uuid"))
51
  )
52
 
53
  # Ensure the model is ready to use
tests/integration/test_jobs_integration.py CHANGED
@@ -450,6 +450,7 @@ async def test_job_model_input_and_output(self, trainml, capsys):
450
  )
451
  ],
452
  output_type="trainml",
 
453
  ),
454
  model=dict(source_type="trainml", source_uri=model.id),
455
  )
@@ -465,9 +466,7 @@ async def test_job_model_input_and_output(self, trainml, capsys):
465
  assert "Epoch 1/2" in captured.out
466
  assert "Epoch 2/2" in captured.out
467
 
468
- new_model = await trainml.models.get(
469
- workers[0].get("output_model_uuid")
470
- )
471
  assert new_model.id
472
  await new_model.wait_for("ready")
473
  await new_model.refresh()
@@ -618,12 +617,7 @@ async def test_cpu_instance(self, trainml, capsys):
618
  "python $TRAINML_MODEL_PATH/pytorch/main.py",
619
  ],
620
  data=dict(
621
- datasets=[
622
- dict(
623
- id="CIFAR-10",
624
- public=True,
625
- )
626
- ],
627
  ),
628
  )
629
  assert job.id
450
  )
451
  ],
452
  output_type="trainml",
453
+ output_uri="model",
454
  ),
455
  model=dict(source_type="trainml", source_uri=model.id),
456
  )
466
  assert "Epoch 1/2" in captured.out
467
  assert "Epoch 2/2" in captured.out
468
 
469
+ new_model = await trainml.models.get(workers[0].get("output_uuid"))
 
 
470
  assert new_model.id
471
  await new_model.wait_for("ready")
472
  await new_model.refresh()
617
  "python $TRAINML_MODEL_PATH/pytorch/main.py",
618
  ],
619
  data=dict(
620
+ datasets=[dict(id="MNIST", public=True)],
 
 
 
 
 
621
  ),
622
  )
623
  assert job.id
trainml/__init__.py CHANGED
@@ -13,5 +13,5 @@
13
  logger = logging.getLogger(__name__)
14
 
15
 
16
- __version__ = "0.4.9"
17
  __all__ = "TrainML"
13
  logger = logging.getLogger(__name__)
14
 
15
 
16
+ __version__ = "0.4.10"
17
  __all__ = "TrainML"
trainml/checkpoints.py CHANGED
@@ -159,6 +159,14 @@ async def remove(self, force=False):
159
  dict(project_uuid=self._project_uuid, force=force),
160
  )
161
 
 
 
 
 
 
 
 
 
162
  def _get_msg_handler(self, msg_handler):
163
  def handler(data):
164
  if data.get("type") == "subscription":
159
  dict(project_uuid=self._project_uuid, force=force),
160
  )
161
 
162
+ async def rename(self, name):
163
+ await self.trainml._query(
164
+ f"/checkpoint/{self._id}",
165
+ "PATCH",
166
+ None,
167
+ dict(name=name),
168
+ )
169
+
170
  def _get_msg_handler(self, msg_handler):
171
  def handler(data):
172
  if data.get("type") == "subscription":
trainml/cli/checkpoint.py CHANGED
@@ -241,3 +241,25 @@ def remove(config, checkpoint, force):
241
  raise click.UsageError("Cannot find specified checkpoint.")
242
 
243
  return config.trainml.run(found.remove(force=force))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  raise click.UsageError("Cannot find specified checkpoint.")
242
 
243
  return config.trainml.run(found.remove(force=force))
244
+
245
+
246
+ @checkpoint.command()
247
+ @click.argument("checkpoint", type=click.STRING)
248
+ @click.argument("name", type=click.STRING)
249
+ @pass_config
250
+ def rename(config, checkpoint, name):
251
+ """
252
+ Renames a checkpoint.
253
+
254
+ CHECKPOINT may be specified by name or ID, but ID is preferred.
255
+ """
256
+ try:
257
+ checkpoint = config.trainml.run(
258
+ config.trainml.client.checkpoints.get(checkpoint)
259
+ )
260
+ if checkpoint is None:
261
+ raise click.UsageError("Cannot find specified checkpoint.")
262
+ except:
263
+ raise click.UsageError("Cannot find specified checkpoint.")
264
+
265
+ return config.trainml.run(checkpoint.rename(name=name))
trainml/cli/dataset.py CHANGED
@@ -239,3 +239,25 @@ def remove(config, dataset, force):
239
  raise click.UsageError("Cannot find specified dataset.")
240
 
241
  return config.trainml.run(found.remove(force=force))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  raise click.UsageError("Cannot find specified dataset.")
240
 
241
  return config.trainml.run(found.remove(force=force))
242
+
243
+
244
+ @dataset.command()
245
+ @click.argument("dataset", type=click.STRING)
246
+ @click.argument("name", type=click.STRING)
247
+ @pass_config
248
+ def rename(config, dataset, name):
249
+ """
250
+ Renames a dataset.
251
+
252
+ DATASET may be specified by name or ID, but ID is preferred.
253
+ """
254
+ try:
255
+ dataset = config.trainml.run(
256
+ config.trainml.client.datasets.get(dataset)
257
+ )
258
+ if dataset is None:
259
+ raise click.UsageError("Cannot find specified dataset.")
260
+ except:
261
+ raise click.UsageError("Cannot find specified dataset.")
262
+
263
+ return config.trainml.run(dataset.rename(name=name))
trainml/cli/job/create.py CHANGED
@@ -183,7 +183,7 @@ def create(config):
183
  show_default=True,
184
  help="Seconds to wait for job to start",
185
  )
186
- @click.argument("name", type=click.STRING)
187
  @pass_config
188
  def notebook(
189
  config,
@@ -436,6 +436,12 @@ def notebook(
436
  show_default=True,
437
  help="Zip the output contents before uploading.",
438
  )
 
 
 
 
 
 
439
  @click.option(
440
  "--environment",
441
  type=click.Choice(
@@ -509,12 +515,8 @@ def notebook(
509
  type=click.Path(exists=True, file_okay=False, resolve_path=True),
510
  help="Local file path to copy as the model data",
511
  )
512
- @click.argument("name", type=click.STRING)
513
- @click.argument(
514
- "commands",
515
- type=click.STRING,
516
- nargs=-1,
517
- )
518
  @pass_config
519
  def training(
520
  config,
@@ -533,6 +535,7 @@ def training(
533
  output_type,
534
  output_uri,
535
  archive,
 
536
  environment,
537
  custom_image,
538
  env,
@@ -581,14 +584,16 @@ def training(
581
  if output_type:
582
  options["data"]["output_type"] = output_type
583
  options["data"]["output_uri"] = output_uri
584
- if not archive:
585
- options["data"]["output_options"] = dict(archive=False)
 
586
 
587
  if output_dir:
588
  options["data"]["output_type"] = "local"
589
  options["data"]["output_uri"] = output_dir
590
- if not archive:
591
- options["data"]["output_options"] = dict(archive=False)
 
592
 
593
  try:
594
  envs = [
@@ -775,6 +780,12 @@ def training(
775
  show_default=True,
776
  help="Zip the output contents before uploading.",
777
  )
 
 
 
 
 
 
778
  @click.option(
779
  "--environment",
780
  type=click.Choice(
@@ -848,11 +859,8 @@ def training(
848
  type=click.Path(exists=True, file_okay=False, resolve_path=True),
849
  help="Local file path to copy as the model data",
850
  )
851
- @click.argument("name", type=click.STRING)
852
- @click.argument(
853
- "command",
854
- type=click.STRING,
855
- )
856
  @pass_config
857
  def inference(
858
  config,
@@ -871,6 +879,7 @@ def inference(
871
  output_type,
872
  output_uri,
873
  archive,
 
874
  environment,
875
  custom_image,
876
  env,
@@ -920,14 +929,16 @@ def inference(
920
  if output_type:
921
  options["data"]["output_type"] = output_type
922
  options["data"]["output_uri"] = output_uri
923
- if not archive:
924
- options["data"]["output_options"] = dict(archive=False)
 
925
 
926
  if output_dir:
927
  options["data"]["output_type"] = "local"
928
  options["data"]["output_uri"] = output_dir
929
- if not archive:
930
- options["data"]["output_options"] = dict(archive=False)
 
931
 
932
  try:
933
  envs = [
@@ -1176,7 +1187,7 @@ def from_json(config, attach, connect, file):
1176
  show_default=True,
1177
  help="Seconds to wait for job to start",
1178
  )
1179
- @click.argument("name", type=click.STRING)
1180
  @pass_config
1181
  def endpoint(
1182
  config,
183
  show_default=True,
184
  help="Seconds to wait for job to start",
185
  )
186
+ @click.argument("name", type=click.STRING, required=True)
187
  @pass_config
188
  def notebook(
189
  config,
436
  show_default=True,
437
  help="Zip the output contents before uploading.",
438
  )
439
+ @click.option(
440
+ "--save-model/--no-save-model",
441
+ default=False,
442
+ show_default=True,
443
+ help="Include the model directory in the job output",
444
+ )
445
  @click.option(
446
  "--environment",
447
  type=click.Choice(
515
  type=click.Path(exists=True, file_okay=False, resolve_path=True),
516
  help="Local file path to copy as the model data",
517
  )
518
+ @click.argument("name", type=click.STRING, required=True)
519
+ @click.argument("commands", type=click.STRING, nargs=-1, required=True)
 
 
 
 
520
  @pass_config
521
  def training(
522
  config,
535
  output_type,
536
  output_uri,
537
  archive,
538
+ save_model,
539
  environment,
540
  custom_image,
541
  env,
584
  if output_type:
585
  options["data"]["output_type"] = output_type
586
  options["data"]["output_uri"] = output_uri
587
+ options["data"]["output_options"] = dict(
588
+ archive=archive, save_model=save_model
589
+ )
590
 
591
  if output_dir:
592
  options["data"]["output_type"] = "local"
593
  options["data"]["output_uri"] = output_dir
594
+ options["data"]["output_options"] = dict(
595
+ archive=archive, save_model=save_model
596
+ )
597
 
598
  try:
599
  envs = [
780
  show_default=True,
781
  help="Zip the output contents before uploading.",
782
  )
783
+ @click.option(
784
+ "--save-model/--no-save-model",
785
+ default=False,
786
+ show_default=True,
787
+ help="Include the model directory in the job output",
788
+ )
789
  @click.option(
790
  "--environment",
791
  type=click.Choice(
859
  type=click.Path(exists=True, file_okay=False, resolve_path=True),
860
  help="Local file path to copy as the model data",
861
  )
862
+ @click.argument("name", type=click.STRING, required=True)
863
+ @click.argument("command", type=click.STRING, required=True)
 
 
 
864
  @pass_config
865
  def inference(
866
  config,
879
  output_type,
880
  output_uri,
881
  archive,
882
+ save_model,
883
  environment,
884
  custom_image,
885
  env,
929
  if output_type:
930
  options["data"]["output_type"] = output_type
931
  options["data"]["output_uri"] = output_uri
932
+ options["data"]["output_options"] = dict(
933
+ archive=archive, save_model=save_model
934
+ )
935
 
936
  if output_dir:
937
  options["data"]["output_type"] = "local"
938
  options["data"]["output_uri"] = output_dir
939
+ options["data"]["output_options"] = dict(
940
+ archive=archive, save_model=save_model
941
+ )
942
 
943
  try:
944
  envs = [
1187
  show_default=True,
1188
  help="Seconds to wait for job to start",
1189
  )
1190
+ @click.argument("name", type=click.STRING, required=True)
1191
  @pass_config
1192
  def endpoint(
1193
  config,
trainml/cli/model.py CHANGED
@@ -212,3 +212,23 @@ def remove(config, model, force):
212
  raise click.UsageError("Cannot find specified model.")
213
 
214
  return config.trainml.run(found.remove(force=force))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  raise click.UsageError("Cannot find specified model.")
213
 
214
  return config.trainml.run(found.remove(force=force))
215
+
216
+
217
+ @model.command()
218
+ @click.argument("model", type=click.STRING)
219
+ @click.argument("name", type=click.STRING)
220
+ @pass_config
221
+ def rename(config, model, name):
222
+ """
223
+ Renames a model.
224
+
225
+ MODEL may be specified by name or ID, but ID is preferred.
226
+ """
227
+ try:
228
+ model = config.trainml.run(config.trainml.client.models.get(model))
229
+ if model is None:
230
+ raise click.UsageError("Cannot find specified model.")
231
+ except:
232
+ raise click.UsageError("Cannot find specified model.")
233
+
234
+ return config.trainml.run(model.rename(name=name))
trainml/datasets.py CHANGED
@@ -153,6 +153,14 @@ async def remove(self, force=False):
153
  dict(project_uuid=self._project_uuid, force=force),
154
  )
155
 
 
 
 
 
 
 
 
 
156
  def _get_msg_handler(self, msg_handler):
157
  def handler(data):
158
  if data.get("type") == "subscription":
153
  dict(project_uuid=self._project_uuid, force=force),
154
  )
155
 
156
+ async def rename(self, name):
157
+ await self.trainml._query(
158
+ f"/dataset/{self._id}",
159
+ "PATCH",
160
+ None,
161
+ dict(name=name),
162
+ )
163
+
164
  def _get_msg_handler(self, msg_handler):
165
  def handler(data):
166
  if data.get("type") == "subscription":
trainml/models.py CHANGED
@@ -146,6 +146,14 @@ async def remove(self, force=False):
146
  dict(project_uuid=self._project_uuid, force=force),
147
  )
148
 
 
 
 
 
 
 
 
 
149
  def _get_msg_handler(self, msg_handler):
150
  def handler(data):
151
  if data.get("type") == "subscription":
146
  dict(project_uuid=self._project_uuid, force=force),
147
  )
148
 
149
+ async def rename(self, name):
150
+ await self.trainml._query(
151
+ f"/model/{self._id}",
152
+ "PATCH",
153
+ None,
154
+ dict(name=name),
155
+ )
156
+
157
  def _get_msg_handler(self, msg_handler):
158
  def handler(data):
159
  if data.get("type") == "subscription":

Readme

<div align="center"> <a href="https://www.trainml.ai/"><img src="https://www.trainml.ai/static/img/trainML-logo-purple.png"></a><br> </div>

trainML Python SDK and Command Line Tools

Provides programmatic access to the trainML platform.

Installation

Python 3.8 or above is required.

pip install trainml

Authentication

Prerequisites

You must have a valid trainML account. On the account settings page click the Create button in the API Keys section. This will automatically download a credentials.json file. This file can only be generated once per API key. Treat this file as a password, as anyone with access to your API key will have the ability to create and control resources in your trainML account. You can deactivate any API key by clicking the Remove button.

Creating resources on the trainML platform requires a non-zero credit balance. To purchase credits or sign-up for automatic credit top-ups, visit the billing page.

Methods

Credentials File

The easiest way to authenticate is to place the credentials file downloaded into the .trainml folder of your home directory and ensure only you have access to it. From the directory that the credentials.json file was downloaded, run the following command:

mkdir -p ~/.trainml
mv credentials.json ~/.trainml/credentials.json
chmod 600 ~/.trainml/credentials.json

Environment Variables

You can also use environment variables TRAINML_USER and TRAINML_KEY and set them to their respective values from the credentials.json file.

export TRAINML_USER=<'user' field from credentials.json>
export TRAINML_KEY=<'key' field from credentials.json>
python create_job.py

Environment variables will override any credentials stored in ~/.trainml/credentials.json

Runtime Variables

API credentials can also be passed directly to the TrainML object constructor at runtime.

import trainml
trainml = trainml.TrainML(user="user field from credentials.json",key="key field from credentials.json")
await trainml.jobs.create(...)

Passing credentials to the TrainML constructor will override all other methods for setting credentials.

Configuration

By default, all operations using the trainML SDK/CLI will use the Personal project of the trainML account the API keys were generated from. To change the active project, run the configure command:

trainml configure

This command will output the currently configured active project (UNSET defaults to Personal) and allows you to specify any project you have access to as the new active project.

Current Active Project: Personal
Select Active Project: (My Other Project, Personal, Project Shared With Me) [Personal]:

Once you select a project, it will store the results of your selection in the config.json file in the TRAINML_CONFIG_DIR folder (~/.trainml by default). Once the active project is set, all subsequent operations will use the selected project.

This setting can also be overridden at runtime using the environment variable TRAINML_PROJECT:

TRAINML_PROJECT=<PROJECT ID> python create_job.py

or by instantiating the trainml client with the project keyword argument:

import trainml
trainml = trainml.TrainML(project="PROJECT ID")
await trainml.jobs.create(...)

You must specify the project ID (not name) when using the runtime options. The project ID can be found by running trainml project list.

Usage

Python SDK

The trainML SDK utilizes the asyncio library to ease the concurrent execution of long running tasks. An example of how to create a dataset from an S3 bucket and immediately run a training job on that dataset is the following:

from trainml.trainml import TrainML
import asyncio


trainml_client = TrainML()

# Create the dataset
dataset = asyncio.run(
    trainml_client.datasets.create(
        name="Example Dataset",
        source_type="aws",
        source_uri="s3://trainml-examples/data/cifar10",
    )
)

print(dataset)

# Watch the log output, attach will return when data transfer is complete
asyncio.run(dataset.attach())

# Create the job
job = asyncio.run(
    trainml_client.jobs.create(
        name="Example Training Job",
        type="training",
        gpu_type="GTX 1060",
        gpu_count=1,
        disk_size=10,
        workers=[
            "PYTHONPATH=$PYTHONPATH:$TRAINML_MODEL_PATH python -m official.vision.image_classification.resnet_cifar_main --num_gpus=1 --data_dir=$TRAINML_DATA_PATH --model_dir=$TRAINML_OUTPUT_PATH --enable_checkpoint_and_export=True --train_epochs=10 --batch_size=1024",
        ],
        data=dict(
            datasets=[dict(id=dataset.id, type="existing")],
            output_uri="s3://trainml-examples/output/resnet_cifar10",
            output_type="aws",
        ),
        model=dict(git_uri="git@github.com:trainML/test-private.git"),
    )
)
print(job)

# Watch the log output, attach will return when the training job stops
asyncio.run(job.attach())

# Cleanup job and dataset
asyncio.run(job.remove())
asyncio.run(dataset.remove())

See more examples in the examples folder

Command Line Interface

The command line interface is rooted in the trainml command. To see the available options, run:

trainml --help

To list all jobs:

trainml job list

To list all datasets:

trainml dataset list

To connect to a job that requires the connection capability:

trainml job connect <job ID or name>

To watch the realtime job logs:

trainml job attach <job ID or name>

To create and open a notebook job:

trainml job create notebook "My Notebook Job"

To create a multi-GPU notebook job on a specific GPU type with larger scratch directory space:

trainml job create notebook --gpu-type "RTX 3090" --gpu-count 4 --disk-size 50 "My Notebook Job"

To run the model training code in the train.py file in your local ~/model-code directory on the training data in your local ~/data directory:

trainml job create training --model-dir ~/model-code --data-dir ~/data "My Training Job" "python train.py"

Stop a job by job ID:

trainml job stop fe52527c-1f4b-468f-b57d-86db864cc089

Stop a job by name:

trainml job stop "My Notebook Job"

Restart a notebook job:

trainml job start "My Notebook Job"

Remove a job by job ID:

trainml job remove fe52527c-1f4b-468f-b57d-86db864cc089