作者:Pratik Bhavsar
import os
from abc import ABCMeta, abstractmethod


class DataProcessor(metaclass=ABCMeta):
    """Base processor to be used for all data preparation.

    Args:
        input_directory: Location the raw data is read from.
        output_directory: Location the processed data is written to.
    """

    def __init__(self, input_directory, output_directory):
        self.input_directory = input_directory
        self.output_directory = output_directory

    @abstractmethod
    def read(self):
        """Read raw data."""

    @abstractmethod
    def process(self):
        """Process raw data.

        This step should create the raw dataframe with all the required
        features. It should not implement statistical or text cleaning.
        """

    @abstractmethod
    def save(self):
        """Save processed data."""


class Trainer(metaclass=ABCMeta):
    """Base trainer to be used for all models.

    Args:
        directory: Root working directory; models live in ``<directory>/models``.
    """

    def __init__(self, directory):
        self.directory = directory
        self.model_directory = os.path.join(directory, 'models')

    @abstractmethod
    def preprocess(self):
        """Take the preprocessed data and return clean data.

        This is where statistical or text cleaning belongs.
        """

    @abstractmethod
    def set_model(self):
        """Define the model."""

    @abstractmethod
    def fit_model(self):
        """Take the vectorised data and return a trained model."""

    @abstractmethod
    def generate_metrics(self):
        """Generate metrics with the trained model and test data."""

    @abstractmethod
    def save_model(self, model_name):
        """Save the model in the required format."""


class Predict(metaclass=ABCMeta):
    """Base predictor to be used for all models.

    Args:
        directory: Root working directory; models live in ``<directory>/models``.
    """

    def __init__(self, directory):
        self.directory = directory
        self.model_directory = os.path.join(directory, 'models')

    @abstractmethod
    def load_model(self):
        """Load the model."""

    @abstractmethod
    def preprocess(self):
        """Take the raw data and return clean data for prediction."""

    @abstractmethod
    def predict(self):
        """Run prediction."""


class BaseDB(metaclass=ABCMeta):
    """Base database class to be used for all DB connectors."""

    @abstractmethod
    def get_connection(self):
        """Create a new DB connection."""

    @abstractmethod
    def close_connection(self):
        """Close the DB connection."""


# Reproducibility of experiments is very important, and the seed is our enemy.
# Pin it down; otherwise it leads to different train/test splits and different
# neural-network weight initialisation, which gives inconsistent results.
def set_seed(args):
    """Seed every random number generator used in training.

    Seeds Python's ``random``, NumPy's global RNG and PyTorch's CPU RNG with
    ``args.seed``; when ``args.n_gpu`` is positive the CUDA RNGs are seeded
    as well.

    Args:
        args: Object exposing an integer ``seed`` attribute and, optionally,
            an ``n_gpu`` attribute (treated as 0 when absent).
    """
    # Imported locally so the snippet is self-contained.
    import random

    import numpy as np
    import torch

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if getattr(args, "n_gpu", 0) > 0:
        torch.cuda.manual_seed_all(args.seed)


# If your data is too big and you are working on a later part of the code,
# such as cleaning data or modelling, use nrows to avoid loading the huge data
# every time. Use this when you only want to test the code without actually
# running the whole thing.
import pandas as pd

# Load only the first 1000 rows so that downstream code can be exercised
# quickly without reading the whole file.
df_train = pd.read_csv('train.csv', nrows=1000)

# Always check for NAs in the data, because they will cause you problems
# later. Even if your current data has none, that does not mean it will not
# happen in a future retraining loop. So keep checking anyway.
# Row count before dropping missing values.
print(len(df))
# Per-column count of missing values; printed so it is visible outside a notebook.
print(df.isna().sum())
# dropna() returns a new frame, so the result must be assigned back.
df = df.dropna()
# Row count after dropping missing values.
print(len(df))

# When you are processing big data, it definitely feels good to know how long
# it will take and where we are in the whole process.
选项 1 — tqdm
import time

from tqdm import tqdm

# Register the progress_apply / progress_map methods on pandas objects.
tqdm.pandas()
df['col'] = df['col'].progress_apply(lambda x: x**2)

# Wrapping any iterable in tqdm() shows a progress bar while looping over it.
text = ""
for char in tqdm(["a", "b", "c", "d"]):
    time.sleep(0.25)
    text = text + char

# Option 2 — fastprogress
from time import sleep

from fastprogress.fastprogress import master_bar, progress_bar

# Outer bar over 10 iterations, each with a nested child bar of 100 steps.
mb = master_bar(range(10))
for i in mb:
    for j in progress_bar(range(100), parent=mb):
        sleep(0.01)
        mb.child.comment = 'second bar stat'
    # NOTE(review): newer fastprogress releases appear to call this attribute
    # `main_bar` rather than `first_bar` — confirm against the installed version.
    mb.first_bar.comment = 'first bar stat'
    mb.write(f'Finished loop {i}.')

# If you have used pandas, you know how slow it can sometimes be — especially
# groupby. Rather than racking your brain for a "great" speed-up solution,
# just use modin by changing one line of code.
# Replaces the usual `import pandas as pd`.
import modin.pandas as pd

# Not all functions are created equal.
import time
from functools import wraps


def timing(f):
    """Decorator that prints how long each call to ``f`` takes.

    Usage:
        @timing
        def function(a):
            pass

    Args:
        f: The callable to time.

    Returns:
        A wrapper with the same signature and metadata as ``f`` that prints
        the function name and elapsed seconds, then returns ``f``'s result.
        If ``f`` raises, the exception propagates and nothing is printed.
    """
    @wraps(f)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so it is unaffected by system clock
        # adjustments.
        start = time.perf_counter()
        result = f(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print('function:%r took: %2.2f sec' % (f.__name__, elapsed))
        return result
    return wrapper


# Nobody likes an engineer who wastes cloud resources.
但要把主代码放在 try 块中,并在 except 块中也调用此方法 —— 这样即使发生错误,服务器也不会一直空转。是的,我也遇到过这种情况。
import math
import os


def run_command(cmd):
    """Run ``cmd`` through the system shell and return ``os.system``'s status."""
    return os.system(cmd)


def shutdown(seconds=0, os='linux'):
    """Shut the machine down after a delay.

    Useful for shutting down EC2 instances to save costs.

    Args:
        seconds: Non-negative delay before shutdown. On Linux ``shutdown``
            only accepts whole minutes, so the delay is rounded up to the
            next minute (0 means "now"). On Windows it is passed through
            in seconds.
        os: Target platform, ``'linux'`` or ``'windows'``. This parameter
            shadows the ``os`` module inside the function, which is why the
            command is executed via ``run_command``.

    Returns:
        The status returned by ``run_command``.

    Raises:
        ValueError: If ``seconds`` is not a non-negative integer or ``os``
            is not a supported platform.
    """
    # Coerce to int so nothing but a number is interpolated into the shell
    # command.
    seconds = int(seconds)
    if seconds < 0:
        raise ValueError('seconds must be non-negative, got %d' % seconds)

    if os == 'linux':
        if seconds == 0:
            cmd = 'sudo shutdown -h now'
        else:
            cmd = 'sudo shutdown -h +%d' % math.ceil(seconds / 60)
    elif os == 'windows':
        cmd = 'shutdown -s -t %s' % seconds
    else:
        # A silent no-op here would leave the machine running.
        raise ValueError('unsupported os: %r' % (os,))
    return run_command(cmd)


# After a certain point in modelling, all great insights come only from error
# and metric analysis. Make sure to create and save well-formatted reports for
# yourself and your management.
import json
import os

from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, fbeta_score)


def get_metrics(y, y_pred, beta=2, average_method='macro', y_encoder=None):
    """Compute classification metrics for a set of predictions.

    Args:
        y: True labels.
        y_pred: Predicted labels.
        beta: Beta value passed to ``fbeta_score``.
        average_method: Averaging strategy passed to ``f1_score`` and
            ``fbeta_score``.
        y_encoder: Optional object with ``inverse_transform``; when given,
            both label arrays are mapped through it before scoring.

    Returns:
        A dict with keys ``accuracy``, ``f1_score_macro``,
        ``fbeta_score_macro`` (each rounded to 4 decimals), ``report`` (the
        classification report as a dict) and ``report_csv`` (the text
        report with CRLF line endings).
    """
    if y_encoder is not None:
        y = y_encoder.inverse_transform(y)
        y_pred = y_encoder.inverse_transform(y_pred)
    return {
        'accuracy': round(accuracy_score(y, y_pred), 4),
        'f1_score_macro': round(f1_score(y, y_pred, average=average_method), 4),
        # beta is passed by keyword: scikit-learn declares it keyword-only.
        'fbeta_score_macro': round(
            fbeta_score(y, y_pred, beta=beta, average=average_method), 4),
        'report': classification_report(y, y_pred, output_dict=True),
        'report_csv': classification_report(
            y, y_pred, output_dict=False).replace('\n', '\r\n'),
    }


def save_metrics(metrics: dict, model_directory, file_name):
    """Write the text report and the JSON metrics next to the model.

    Writes ``<file_name>_report.txt`` and ``<file_name>_metrics.json`` into
    ``model_directory``. The caller's ``metrics`` dict is not modified; the
    ``report_csv`` entry is left out of the JSON file.

    Args:
        metrics: Dict as produced by ``get_metrics``; must contain
            ``report_csv``.
        model_directory: Directory the two files are written to.
        file_name: Prefix for both output file names.

    Raises:
        KeyError: If ``metrics`` has no ``report_csv`` entry.
    """
    report_path = os.path.join(model_directory, file_name + '_report.txt')
    # NOTE(review): classification_report_to_csv is not defined in the visible
    # source — presumably a project helper that writes the text report to the
    # given path; confirm it is in scope.
    classification_report_to_csv(metrics['report_csv'], report_path)

    # Build a filtered copy instead of popping from the caller's dict.
    serialisable = {k: v for k, v in metrics.items() if k != 'report_csv'}
    metrics_path = os.path.join(model_directory, file_name + '_metrics.json')
    with open(metrics_path, 'w', encoding='utf-8') as fh:
        json.dump(serialisable, fh, indent=4)


# All that ends bad is bad.
fastapi + uvicorn
Fastest — 使用fastapi编写API,因为它很快。
Documentation — 用fastapi写API让我们不用操心文档。
Workers — 使用uvicorn部署API
# Install the web framework and the ASGI server.
pip install fastapi uvicorn
# Serve `app` from main.py with 4 worker processes on port 8000.
# --host needs a value; 0.0.0.0 listens on all interfaces (use 127.0.0.1 for
# local-only access).
uvicorn main:app --workers 4 --host 0.0.0.0 --port 8000
# —END—