allegroai
在做试验时,需要画图,需要统计性能,查看试验的进度,如果全部都靠jupyter,多少有些繁琐,因此想要找一些pipeline工具,最好可以可视化查看,因此调研了allegroai,如下。
trains是开源的python组件,包括 Trains Client、Trains Server、Trains Agent 三部分
一、Trains Server
如果不指定,默认会将结果上传到官方的demo网站,因此需要自己安装配置Server
### 安装
需要安装docker-compose
git 路径: https://github.com/allegroai/trains-server
安装需要注意的是:
1. chmod -R 777 /opt/trains/
2. 需要将docker-compose.yml放到/opt/trains/
配置
server会有一些配置
https://allegro.ai/docs/deploying_trains/trains_server_config/
默认的配置文件 trains-server/server/config/default/
例如,对登录网站的用户名和密码设置/opt/trains/config/apiserver.conf
![](https://img.laitimes.com/img/9ZDMuAjOiMmIsIjOiQnIsICM38FdsYkRGZkRG9lcvx2bjxiNx8VZ6l2cs0TP31UMZRUTzklaNBDOsJGcohVYsR2MMBjVtJWd0ckW65UbM5WOHJWa5kHT20ESjBjUIF2X0hXZ0xCMx81dvRWYoNHLrdEZwZ1Rh5WNXp1bwNjW1ZUba9VZwlHdssmch1mclRXY39CXldWYtlWPzNXZj9mcw1ycz9WL49zZuBnLwETO4UTM0YTMxIDOwAjMwIzLc52YucWbp5GZzNmLn9Gbi1yZtl2Lc9CX6MHc0RHaiojIsJye.png)
设置完之后重启
docker-compose down
docker-compose -f docker-compose.yml up -d
二、Trains Python Client
A. 配置使用某台服务器搭建的Server
不使用默认的demo web ui,而是自己配置的server,有以下2种方法
1. trains-init
然后按照步骤去操作
2. ~/trains.conf
api {
# API server on port 8008
api_server: "http://localhost:8008"
# web_server on port 8080
web_server: "http://localhost:8080"
# file server on port 8081
files_server: "http://localhost:8081"
}
B. 安装
server提供了可视化的Web UI,我们只需在自己的脚本中添加几行代码,即可。
pip install trains
C. 使用例子
函数说明文档API:https://allegro.ai/docs/logger.html
举例说明,以下面脚本为例
https://github.com/allegroai/trains/blob/master/examples/frameworks/pytorch/pytorch_mnist.py
from trains import Task
task = Task.init(project_name="my project", task_name="my task")
1. output 指定输出路径
Task.init()
参数 output_uri = model_snapshots_path
```
task = Task.init(project_name='pkl examples', task_name='pkl train',output_uri='/allegro/trians/')
```
脚本运行之后,会创建以下路径
+-- <output destination name>
| +-- <project name>
| +-- <task name>.<Task Id>
| +-- models
| +-- artifacts
2. logger 日志:文字或画图
可视化绘图
logger = task.get_logger()
logger.report_histogram()
logger.report_text()
有时候想要将dataframe也上传到plot上,会更好看一点
from trains import Task
task = Task.init(project_name='Training', task_name='pandas table reporting')
logger = task.get_logger()
logger.report_table("table pd", "table pd",iteration=0, table_plot=df)
task.close()
结果会在result标签下的plots子标签
3. register artifact 会上传到server,change也会上传
artifact可以认为是将dataframe上传到server查看
# Register the test loss and correct as a Pandas DataFrame artifact
task.register_artifact('Test_Loss_Correct', df, metadata={'metadata string': 'apple',
'metadata int': 100, 'metadata dict': {'dict string': 'pear', 'dict int': 200}})
# Once the artifact is registered, we can get it and work with it. Here, we sample it.
sample = Task.current_task().get_registered_artifacts()['Test_Loss_Correct'].sample(frac=0.5,
replace=True, random_state=1)
4. task.upload_artifact() 会上传到server,但是change不会
三、实例
以下是个完整的例子和结果
# TRAINS - Example of Pytorch mnist training integration
#
from __future__ import print_function
import argparse
import os
from tempfile import gettempdir
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
from torchvision import datasets, transforms
from trains import Task, Logger
class Net(nn.Module):
    """Small LeNet-style CNN for 28x28 single-channel MNIST digits (10 classes)."""

    def __init__(self):
        super(Net, self).__init__()
        # Two convolutional layers followed by two fully connected layers.
        # Attribute names are kept as conv1/conv2/fc1/fc2 so state_dict keys
        # (and saved checkpoints) stay compatible.
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        self.fc1 = nn.Linear(4 * 4 * 50, 500)
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        out = F.max_pool2d(F.relu(self.conv1(x)), 2, 2)
        out = F.max_pool2d(F.relu(self.conv2(out)), 2, 2)
        out = out.view(-1, 4 * 4 * 50)  # flatten to (batch, 800)
        logits = self.fc2(F.relu(self.fc1(out)))
        return F.log_softmax(logits, dim=1)
def train(args, model, device, train_loader, optimizer, epoch):
    """Run one training epoch, reporting the loss to TRAINS every ``args.log_interval`` batches."""
    model.train()
    for step, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        loss = F.nll_loss(model(inputs), labels)
        loss.backward()
        optimizer.step()
        if step % args.log_interval != 0:
            continue
        # Scalar reporting: one global "train/loss" curve indexed by overall iteration ...
        Logger.current_logger().report_scalar(
            "train", "loss", iteration=(epoch * len(train_loader) + step), value=loss.item())
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, step * len(inputs), len(train_loader.dataset),
            100. * step / len(train_loader), loss.item()))
        # ... plus a per-epoch curve keyed on the batch index within this epoch.
        Logger.current_logger().report_scalar(
            title='Scalar example {} - epoch'.format(epoch), series='Loss',
            value=loss.item(), iteration=step)
def test(args, model, device, test_loader, epoch):
    """Evaluate *model* on *test_loader*; report scalars, plots and artifacts to TRAINS."""
    model.eval()
    running_loss, hits = 0, 0
    loss_history, hit_history = [], []
    with torch.no_grad():
        for batch, labels in test_loader:
            batch, labels = batch.to(device), labels.to(device)
            scores = model(batch)
            # Accumulate the summed (not averaged) NLL loss over the whole set.
            running_loss += F.nll_loss(scores, labels, reduction='sum').item()
            prediction = scores.argmax(dim=1, keepdim=True)  # index of the max log-probability
            hits += prediction.eq(labels.view_as(prediction)).sum().item()
            hit_history.append(hits)
            loss_history.append(running_loss)
    running_loss /= len(test_loader.dataset)
    # Per-epoch scalar metrics.
    Logger.current_logger().report_scalar(
        "test", "loss", iteration=epoch, value=running_loss)
    Logger.current_logger().report_scalar(
        "test", "accuracy", iteration=epoch, value=(hits / len(test_loader.dataset)))
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        running_loss, hits, len(test_loader.dataset),
        100. * hits / len(test_loader.dataset)))
    # Plots: cumulative-correct histogram and a loss/correct matrix.
    Logger.current_logger().report_histogram(
        title='Histogram example', series='correct', iteration=1,
        values=hit_history, xaxis='Test', yaxis='Correct')
    matrix = np.array([loss_history, hit_history])
    Logger.current_logger().report_confusion_matrix(
        title='Confusion matrix example', series='Test loss / correct',
        matrix=matrix, iteration=1)
    # Build a DataFrame of cumulative loss/correct per batch for server-side viewing.
    df = pd.DataFrame({'test lost': loss_history, 'correct': hit_history},
                      columns=['test lost', 'correct'])
    # Registered artifact: the server keeps tracking later changes to the DataFrame.
    Task.current_task().register_artifact(
        'Test_Loss_Correct', df,
        metadata={'metadata string': 'apple', 'metadata int': 100,
                  'metadata dict': {'dict string': 'pear', 'dict int': 200}})
    # Uploaded artifact: a one-off snapshot (later changes are not tracked).
    Task.current_task().upload_artifact(
        'Predictions', artifact_object=np.array(loss_history),
        metadata={'metadata string': 'banana', 'metadata integer': 300,
                  'metadata dictionary': {'dict string': 'orange', 'dict int': 400}})
def main():
    """Entry point: register the run with the TRAINS server, parse CLI args, train and test on MNIST."""
    # Model snapshots produced during the run are uploaded under output_uri.
    task = Task.init(project_name='pkl examples', task_name='pkl train', output_uri='/allegro/trians/')

    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    # NOTE(review): with action='store_true' AND default=True this flag can never be
    # switched off from the command line — the model is always saved.
    parser.add_argument('--save-model', action='store_true', default=True,
                        help='For Saving the current Model')
    args = parser.parse_args()

    cuda_enabled = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if cuda_enabled else "cpu")
    loader_extra = {'num_workers': 4, 'pin_memory': True} if cuda_enabled else {}

    # Identical preprocessing for both splits (MNIST per-channel mean/std).
    mnist_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    data_root = os.path.join('..', 'data')
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(data_root, train=True, download=True, transform=mnist_transform),
        batch_size=args.batch_size, shuffle=True, **loader_extra)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(data_root, train=False, transform=mnist_transform),
        batch_size=args.test_batch_size, shuffle=True, **loader_extra)

    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader, epoch)

    if args.save_model:
        torch.save(model.state_dict(), os.path.join(gettempdir(), "mnist_cnn.pt"))


if __name__ == '__main__':
    main()
可以通过网址: http://IP:8080/ 查看脚本运行的进度,以及log等
查看 artifacts
查看log
查看scalars
查看plots,可交互图