2024-09-13 16:58:34 +08:00
commit b06a9ecee0
103 changed files with 5440 additions and 0 deletions


@@ -0,0 +1,4 @@
from PytorchBoot.runners.trainer import DefaultTrainer
from PytorchBoot.runners.evaluator import DefaultEvaluator
from PytorchBoot.runners.predictor import DefaultPredictor
from PytorchBoot.runners.runner import Runner
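
The package exports the four runner classes introduced in this commit. A minimal, hypothetical entry script (not part of this commit; the config path is a placeholder) would construct a runner from a config file and call run():

# Hypothetical usage sketch, assuming a config laid out as read by the runners below.
from PytorchBoot.runners import DefaultTrainer

if __name__ == "__main__":
    trainer = DefaultTrainer("configs/train_config.yaml")  # placeholder path
    trainer.run()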



@@ -0,0 +1,132 @@
import os
import json
import torch
from tqdm import tqdm
from PytorchBoot.config import ConfigManager
import PytorchBoot.namespace as namespace
import PytorchBoot.stereotype as stereotype
from PytorchBoot.factory import ComponentFactory
from PytorchBoot.dataset import BaseDataset
from PytorchBoot.runners.runner import Runner
from PytorchBoot.utils import Log
@stereotype.runner("default_evaluator")
class DefaultEvaluator(Runner):
def __init__(self, config_path):
super().__init__(config_path)
''' Pipeline '''
self.pipeline_name = self.config[namespace.Stereotype.PIPELINE]
self.pipeline = ComponentFactory.create(namespace.Stereotype.PIPELINE, self.pipeline_name)
self.pipeline:torch.nn.Module = self.pipeline.to(self.device)
''' Experiment '''
self.model_path = self.config["experiment"]["model_path"]
self.load_experiment("default_evaluator")
''' Test '''
self.test_config = ConfigManager.get(namespace.Stereotype.RUNNER, namespace.Mode.TEST)
self.test_dataset_name_list = self.test_config["dataset_list"]
self.test_set_list = []
self.test_writer_list = []
seen_name = set()
for test_dataset_name in self.test_dataset_name_list:
if test_dataset_name not in seen_name:
seen_name.add(test_dataset_name)
else:
raise ValueError("Duplicate test dataset name: {}".format(test_dataset_name))
test_set: BaseDataset = ComponentFactory.create(namespace.Stereotype.DATASET, test_dataset_name)
self.test_set_list.append(test_set)
self.print_info()
def run(self):
eval_result = self.test()
self.save_eval_result(eval_result)
def test(self):
self.pipeline.eval()
eval_result = {}
with torch.no_grad():
test_set: BaseDataset
for dataset_idx, test_set in enumerate(self.test_set_list):
test_set_config = test_set.get_config()
eval_list = test_set_config["eval_list"]
ratio = test_set_config["ratio"]
test_set_name = test_set.get_name()
output_list = []
data_list = []
test_loader = test_set.get_loader()
loop = tqdm(enumerate(test_loader), total=int(len(test_loader)))
for _, data in loop:
test_set.process_batch(data, self.device)
data["mode"] = namespace.Mode.TEST
output = self.pipeline(data)
output_list.append(output)
data_list.append(data)
loop.set_description(
f'Evaluating [{dataset_idx+1}/{len(self.test_set_list)}] (Test: {test_set_name}, ratio={ratio})')
result_dict = self.eval_fn(output_list, data_list, eval_list)
eval_result[test_set_name] = result_dict
return eval_result
def save_eval_result(self, eval_result):
result_dir = os.path.join(str(self.experiment_path), namespace.Direcotry.RESULT_DIR_NAME)
eval_result_path = os.path.join(result_dir, self.file_name + "_eval_result.json")
with open(eval_result_path, "w") as f:
json.dump(eval_result, f, indent=4)
Log.success(f"Saved evaluation result to {eval_result_path}")
@staticmethod
def eval_fn(output_list, data_list, eval_list):
collected_result = {}
for eval_method_name in eval_list:
eval_method = ComponentFactory.create(namespace.Stereotype.EVALUATION_METHOD, eval_method_name)
eval_results:dict = eval_method.evaluate(output_list, data_list)
for data_type, eval_result in eval_results.items():
if data_type not in collected_result:
collected_result[data_type] = {}
for name, value in eval_result.items():
collected_result[data_type][name] = value
return collected_result
def load_checkpoint(self):
self.load(self.model_path)
Log.success(f"Loaded checkpoint from {self.model_path}")
def load_experiment(self, backup_name=None):
super().load_experiment(backup_name)
self.load_checkpoint()
def create_experiment(self, backup_name=None):
super().create_experiment(backup_name)
result_dir = os.path.join(str(self.experiment_path), namespace.Direcotry.RESULT_DIR_NAME)
os.makedirs(result_dir)
def load(self, path):
state_dict = torch.load(path)
self.pipeline.load_state_dict(state_dict)
def print_info(self):
def print_dataset(dataset: BaseDataset):
config = dataset.get_config()
name = dataset.get_name()
Log.blue(f"Dataset: {name}")
for k,v in config.items():
Log.blue(f"\t{k}: {v}")
super().print_info()
table_size = 70
Log.blue(f"{'+' + '-' * (table_size // 2)} Pipeline {'-' * (table_size // 2)}" + '+')
Log.blue(self.pipeline)
Log.blue(f"{'+' + '-' * (table_size // 2)} Datasets {'-' * (table_size // 2)}" + '+')
for i, test_set in enumerate(self.test_set_list):
Log.blue(f"test dataset {i}: ")
print_dataset(test_set)
Log.blue(f"{'+' + '-' * (table_size // 2)}----------{'-' * (table_size // 2)}" + '+')


@@ -0,0 +1,128 @@
import os
import json
import torch
from tqdm import tqdm
from PytorchBoot.config import ConfigManager
import PytorchBoot.namespace as namespace
import PytorchBoot.stereotype as stereotype
from PytorchBoot.factory import ComponentFactory
from PytorchBoot.dataset import BaseDataset
from PytorchBoot.runners.runner import Runner
from PytorchBoot.utils import Log
@stereotype.runner("default_predictor")
class DefaultPredictor(Runner):
def __init__(self, config_path):
super().__init__(config_path)
''' Pipeline '''
self.pipeline_name = self.config[namespace.Stereotype.PIPELINE]
self.pipeline = ComponentFactory.create(namespace.Stereotype.PIPELINE, self.pipeline_name)
self.pipeline:torch.nn.Module = self.pipeline.to(self.device)
''' Experiment '''
self.model_path = self.config["experiment"]["model_path"]
self.load_experiment("default_predictor")
self.save_original_data = self.config["experiment"]["save_original_data"]
''' Testset '''
self.test_config = ConfigManager.get(namespace.Stereotype.RUNNER, namespace.Mode.TEST)
self.test_dataset_name_list = self.test_config["dataset_list"]
self.test_set_list = []
self.test_writer_list = []
seen_name = set()
for test_dataset_name in self.test_dataset_name_list:
if test_dataset_name not in seen_name:
seen_name.add(test_dataset_name)
else:
raise ValueError("Duplicate test dataset name: {}".format(test_dataset_name))
test_set: BaseDataset = ComponentFactory.create(namespace.Stereotype.DATASET, test_dataset_name)
self.test_set_list.append(test_set)
self.print_info()
def run(self):
predict_result = self.predict()
self.save_predict_result(predict_result)
def predict(self):
self.pipeline.eval()
predict_result = {}
with torch.no_grad():
test_set: BaseDataset
for dataset_idx, test_set in enumerate(self.test_set_list):
test_set_config = test_set.get_config()
ratio = test_set_config["ratio"]
test_set_name = test_set.get_name()
output_list = []
data_list = []
test_loader = test_set.get_loader()
loop = tqdm(enumerate(test_loader), total=int(len(test_loader)))
for _, data in loop:
test_set.process_batch(data, self.device)
data["mode"] = namespace.Mode.TEST
output = self.pipeline(data)
output_list.append(output)
data_list.append(data)
loop.set_description(
f'Predicting [{dataset_idx+1}/{len(self.test_set_list)}] (Test: {test_set_name}, ratio={ratio})')
predict_result[test_set_name] = {
"output": output_list,
"data": data_list
}
return predict_result
def save_predict_result(self, predict_result):
result_dir = os.path.join(str(self.experiment_path), namespace.Direcotry.RESULT_DIR_NAME, self.file_name+"_predict_result")
os.makedirs(result_dir)
for test_set_name in predict_result.keys():
os.mkdir(os.path.join(result_dir, test_set_name))
idx = 0
for output, data in zip(predict_result[test_set_name]["output"], predict_result[test_set_name]["data"]):
output_path = os.path.join(result_dir, test_set_name, f"output_{idx}.pth")
torch.save(output, output_path)
if self.save_original_data:
data_path = os.path.join(result_dir, test_set_name, f"data_{idx}.pth")
torch.save(data, data_path)
idx += 1
Log.success(f"Saved predict result of {test_set_name} to {result_dir}")
Log.success(f"Saved all predict result to {result_dir}")
def load_checkpoint(self):
self.load(self.model_path)
Log.success(f"Loaded checkpoint from {self.model_path}")
def load_experiment(self, backup_name=None):
super().load_experiment(backup_name)
self.load_checkpoint()
def create_experiment(self, backup_name=None):
super().create_experiment(backup_name)
result_dir = os.path.join(str(self.experiment_path), namespace.Direcotry.RESULT_DIR_NAME)
os.makedirs(result_dir)
def load(self, path):
state_dict = torch.load(path)
self.pipeline.load_state_dict(state_dict)
def print_info(self):
def print_dataset(dataset: BaseDataset):
config = dataset.get_config()
name = dataset.get_name()
Log.blue(f"Dataset: {name}")
for k,v in config.items():
Log.blue(f"\t{k}: {v}")
super().print_info()
table_size = 70
Log.blue(f"{'+' + '-' * (table_size // 2)} Pipeline {'-' * (table_size // 2)}" + '+')
Log.blue(self.pipeline)
Log.blue(f"{'+' + '-' * (table_size // 2)} Datasets {'-' * (table_size // 2)}" + '+')
for i, test_set in enumerate(self.test_set_list):
Log.blue(f"test dataset {i}: ")
print_dataset(test_set)
Log.blue(f"{'+' + '-' * (table_size // 2)}----------{'-' * (table_size // 2)}" + '+')


@@ -0,0 +1,61 @@
import os
import time
from abc import abstractmethod, ABC
import numpy as np
import torch
from PytorchBoot.config import ConfigManager
from PytorchBoot.utils.log_util import Log
class Runner(ABC):
@abstractmethod
def __init__(self, config_path):
ConfigManager.load_config_with(config_path)
ConfigManager.print_config()
self.config = ConfigManager.get("runner")
self.seed = self.config["general"]["seed"]
self.device = self.config["general"]["device"]
self.cuda_visible_devices = self.config["general"]["cuda_visible_devices"]
os.environ["CUDA_VISIBLE_DEVICES"] = self.cuda_visible_devices
self.experiments_config = self.config["experiment"]
self.experiment_path = os.path.join(self.experiments_config["root_dir"], self.experiments_config["name"])
np.random.seed(self.seed)
torch.manual_seed(self.seed)
lt = time.localtime()
self.file_name = f"{lt.tm_year}_{lt.tm_mon}_{lt.tm_mday}_{lt.tm_hour}h{lt.tm_min}m{lt.tm_sec}s"
@abstractmethod
def run(self):
pass
@abstractmethod
def load_experiment(self, backup_name=None):
if not os.path.exists(self.experiment_path):
Log.info(f"experiments environment {self.experiments_config['name']} does not exists.")
self.create_experiment(backup_name)
else:
Log.info(f"experiments environment {self.experiments_config['name']}")
backup_config_dir = os.path.join(str(self.experiment_path), "configs")
if not os.path.exists(backup_config_dir):
os.makedirs(backup_config_dir)
ConfigManager.backup_config_to(backup_config_dir, self.file_name, backup_name)
@abstractmethod
def create_experiment(self, backup_name=None):
Log.info("creating experiment: " + self.experiments_config["name"])
os.makedirs(self.experiment_path)
backup_config_dir = os.path.join(str(self.experiment_path), "configs")
os.makedirs(backup_config_dir)
ConfigManager.backup_config_to(backup_config_dir, self.file_name, backup_name)
log_dir = os.path.join(str(self.experiment_path), "log")
os.makedirs(log_dir)
cache_dir = os.path.join(str(self.experiment_path), "cache")
os.makedirs(cache_dir)
def print_info(self):
table_size = 80
Log.blue("+" + "-" * table_size + "+")
Log.blue(f"| Experiment <{self.experiments_config['name']}>")
Log.blue("+" + "-" * table_size + "+")


@@ -0,0 +1,266 @@
import os
import json
from datetime import datetime
import torch
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
from PytorchBoot.config import ConfigManager
import PytorchBoot.namespace as namespace
import PytorchBoot.stereotype as stereotype
from PytorchBoot.factory import ComponentFactory
from PytorchBoot.factory import OptimizerFactory
from PytorchBoot.dataset import BaseDataset
from PytorchBoot.runners.runner import Runner
from PytorchBoot.utils.tensorboard_util import TensorboardWriter
from PytorchBoot.stereotype import EXTERNAL_FRONZEN_MODULES
from PytorchBoot.utils import Log
from PytorchBoot.status import status_manager
@stereotype.runner("default_trainer")
class DefaultTrainer(Runner):
def __init__(self, config_path):
super().__init__(config_path)
tensorboard_path = os.path.join(self.experiment_path, namespace.Direcotry.TENSORBOARD_DIR_NAME)
''' Pipeline '''
self.pipeline_name = self.config[namespace.Stereotype.PIPELINE]
self.parallel = self.config["general"]["parallel"]
self.pipeline:torch.nn.Module = ComponentFactory.create(namespace.Stereotype.PIPELINE, self.pipeline_name)
if self.parallel and self.device == "cuda":
self.pipeline = torch.nn.DataParallel(self.pipeline)
self.pipeline = self.pipeline.to(self.device)
''' Experiment '''
self.current_epoch = 0
self.current_iter = 0
self.max_epochs = self.experiments_config["max_epochs"]
self.test_first = self.experiments_config["test_first"]
self.load_experiment("default_trainer")
''' Train '''
self.train_config = ConfigManager.get(namespace.Stereotype.RUNNER, namespace.Mode.TRAIN)
self.train_dataset_name= self.train_config["dataset"]
self.train_set: BaseDataset = ComponentFactory.create(namespace.Stereotype.DATASET, self.train_dataset_name)
self.optimizer = OptimizerFactory.create(self.train_config["optimizer"], self.pipeline.parameters())
self.train_writer = SummaryWriter(
log_dir=os.path.join(tensorboard_path, f"[{namespace.Mode.TRAIN}]{self.train_dataset_name}"))
''' Test '''
self.test_config = ConfigManager.get(namespace.Stereotype.RUNNER, namespace.Mode.TEST)
self.test_dataset_name_list = self.test_config["dataset_list"]
self.test_set_list = []
self.test_writer_list = []
seen_name = set()
for test_dataset_name in self.test_dataset_name_list:
if test_dataset_name not in seen_name:
seen_name.add(test_dataset_name)
else:
raise ValueError("Duplicate test dataset name: {}".format(test_dataset_name))
test_set: BaseDataset = ComponentFactory.create(namespace.Stereotype.DATASET, test_dataset_name)
test_writer = SummaryWriter(
log_dir=os.path.join(tensorboard_path, f"[test]{test_dataset_name}"))
self.test_set_list.append(test_set)
self.test_writer_list.append(test_writer)
self.print_info()
def run(self):
save_interval = self.experiments_config["save_checkpoint_interval"]
if self.current_epoch != 0:
Log.info("Continue training from epoch {}.".format(self.current_epoch))
else:
Log.info("Start training from initial model.")
if self.test_first:
Log.info("Do test first.")
self.test()
while self.current_epoch < self.max_epochs:
self.current_epoch += 1
status_manager.set_progress("train", "default_trainer", "Epoch", self.current_epoch, self.max_epochs)
self.train()
self.test()
if self.current_epoch % save_interval == 0:
self.save_checkpoint()
self.save_checkpoint(is_last=True)
def train(self):
self.pipeline.train()
train_set_name = self.train_dataset_name
config = self.train_set.get_config()
train_loader = self.train_set.get_loader(shuffle=True)
total=len(train_loader)
loop = tqdm(enumerate(train_loader), total=total)
for i, data in loop:
status_manager.set_progress("train", "default_trainer", f"(train) Batch[{train_set_name}]", i+1, total)
self.train_set.process_batch(data, self.device)
loss_dict = self.train_step(data)
loop.set_description(
f'Epoch [{self.current_epoch}/{self.max_epochs}] (Train: {train_set_name}, ratio={config["ratio"]})')
loop.set_postfix(loss=loss_dict)
for loss_name, loss in loss_dict.items():
status_manager.set_status("train", "default_trainer", f"[loss]{loss_name}", loss)
TensorboardWriter.write_tensorboard(self.train_writer, "iter", loss_dict, self.current_iter, simple_scalar=True)
self.current_iter += 1
def train_step(self, data):
self.optimizer.zero_grad()
data["mode"] = namespace.Mode.TRAIN
output = self.pipeline(data)
total_loss, loss_dict = self.loss_fn(output, data)
total_loss.backward()
self.optimizer.step()
for k, v in loss_dict.items():
loss_dict[k] = round(v, 5)
return loss_dict
def loss_fn(self, output, data):
loss_name_list = self.train_config["losses"]
loss_dict = {}
total_loss = torch.tensor(0.0, dtype=torch.float32, device=self.device)
for loss_name in loss_name_list:
target_loss_fn = ComponentFactory.create(namespace.Stereotype.LOSS_FUNCTION, loss_name)
loss = target_loss_fn.compute(output, data)
loss_dict[loss_name] = loss.item()
total_loss += loss
loss_dict['total_loss'] = total_loss.item()
return total_loss, loss_dict
def test(self):
self.pipeline.eval()
with torch.no_grad():
test_set: BaseDataset
for dataset_idx, test_set in enumerate(self.test_set_list):
test_set_config = test_set.get_config()
eval_list = test_set_config["eval_list"]
ratio = test_set_config["ratio"]
test_set_name = test_set.get_name()
writer = self.test_writer_list[dataset_idx]
output_list = []
data_list = []
test_loader = test_set.get_loader()
total=int(len(test_loader))
loop = tqdm(enumerate(test_loader), total=total)
for i, data in loop:
status_manager.set_progress("train", "default_trainer", f"(test) Batch[{test_set_name}]", i+1, total)
test_set.process_batch(data, self.device)
data["mode"] = namespace.Mode.TEST
output = self.pipeline(data)
output_list.append(output)
data_list.append(data)
loop.set_description(
f'Epoch [{self.current_epoch}/{self.max_epochs}] (Test: {test_set_name}, ratio={ratio})')
result_dict = self.eval_fn(output_list, data_list, eval_list)
TensorboardWriter.write_tensorboard(writer, "epoch", result_dict, self.current_epoch - 1)
@staticmethod
def eval_fn(output_list, data_list, eval_list):
collected_result = {}
for eval_method_name in eval_list:
eval_method = ComponentFactory.create(namespace.Stereotype.EVALUATION_METHOD, eval_method_name)
eval_results:dict = eval_method.evaluate(output_list, data_list)
for data_type, eval_result in eval_results.items():
if data_type not in collected_result:
collected_result[data_type] = {}
for name, value in eval_result.items():
collected_result[data_type][name] = value
status_manager.set_status("train", "default_trainer", f"[eval]{name}", value)
return collected_result
def get_checkpoint_path(self, is_last=False):
return os.path.join(self.experiment_path, namespace.Direcotry.CHECKPOINT_DIR_NAME,
"Epoch_{}.pth".format(
self.current_epoch if self.current_epoch != -1 and not is_last else "last"))
def load_checkpoint(self, is_last=False):
self.load(self.get_checkpoint_path(is_last))
Log.success(f"Loaded checkpoint from {self.get_checkpoint_path(is_last)}")
if is_last:
checkpoint_root = os.path.join(self.experiment_path, namespace.Direcotry.CHECKPOINT_DIR_NAME)
meta_path = os.path.join(checkpoint_root, "meta.json")
if not os.path.exists(meta_path):
raise FileNotFoundError(
"No checkpoint meta.json file in the experiment {}".format(self.experiments_config["name"]))
file_path = os.path.join(checkpoint_root, "meta.json")
with open(file_path, "r") as f:
meta = json.load(f)
self.current_epoch = meta["last_epoch"]
self.current_iter = meta["last_iter"]
def save_checkpoint(self, is_last=False):
self.save(self.get_checkpoint_path(is_last))
if not is_last:
Log.success(f"Checkpoint at epoch {self.current_epoch} saved to {self.get_checkpoint_path(is_last)}")
else:
meta = {
"last_epoch": self.current_epoch,
"last_iter": self.current_iter,
"time": str(datetime.now())
}
checkpoint_root = os.path.join(self.experiment_path, namespace.Direcotry.CHECKPOINT_DIR_NAME)
file_path = os.path.join(checkpoint_root, "meta.json")
with open(file_path, "w") as f:
json.dump(meta, f)
def load_experiment(self, backup_name=None):
super().load_experiment(backup_name)
if self.experiments_config["use_checkpoint"]:
self.current_epoch = self.experiments_config["epoch"]
self.load_checkpoint(is_last=(self.current_epoch == -1))
def create_experiment(self, backup_name=None):
super().create_experiment(backup_name)
ckpt_dir = os.path.join(str(self.experiment_path), namespace.Direcotry.CHECKPOINT_DIR_NAME)
os.makedirs(ckpt_dir)
tensorboard_dir = os.path.join(str(self.experiment_path), namespace.Direcotry.TENSORBOARD_DIR_NAME)
os.makedirs(tensorboard_dir)
def load(self, path):
state_dict = torch.load(path)
if self.parallel:
self.pipeline.module.load_state_dict(state_dict)
else:
self.pipeline.load_state_dict(state_dict)
def save(self, path):
if self.parallel:
state_dict = self.pipeline.module.state_dict()
else:
state_dict = self.pipeline.state_dict()
for name, module in self.pipeline.named_modules():
if module.__class__ in EXTERNAL_FRONZEN_MODULES:
if name in state_dict:
del state_dict[name]
torch.save(state_dict, path)
def print_info(self):
def print_dataset(dataset: BaseDataset):
config = dataset.get_config()
name = dataset.get_name()
Log.blue(f"Dataset: {name}")
for k,v in config.items():
Log.blue(f"\t{k}: {v}")
super().print_info()
table_size = 70
Log.blue(f"{'+' + '-' * (table_size // 2)} Pipeline {'-' * (table_size // 2)}" + '+')
Log.blue(self.pipeline)
Log.blue(f"{'+' + '-' * (table_size // 2)} Datasets {'-' * (table_size // 2)}" + '+')
Log.blue("train dataset: ")
print_dataset(self.train_set)
for i, test_set in enumerate(self.test_set_list):
Log.blue(f"test dataset {i}: ")
print_dataset(test_set)
Log.blue(f"{'+' + '-' * (table_size // 2)}----------{'-' * (table_size // 2)}" + '+')


@@ -0,0 +1,17 @@
from abc import abstractmethod, ABC
from PytorchBoot.config import ConfigManager
from PytorchBoot.runners import Runner
from PytorchBoot.utils import Log
class WebRunner(Runner):  # Runner already derives from ABC; listing ABC before Runner breaks the MRO
@abstractmethod
def __init__(self, config_path):
ConfigManager.load_config_with(config_path)
@abstractmethod
def run(self):
pass