Let me start by saying that I had zero knowledge about ML until a week ago. I also hadn't written a single line of Python code before I was exposed to ML. So if you see or think that I am doing something really stupid or writing some "noob" code - sorry.
So, what am I going to do?
I am going to predict the prices of goods using a Linear Regression ML approach. For this I am going to use the following:
1. Dataset with basic feature engineering
2. Dataset with more advanced feature engineering
3. XGBoost, Linear Learner and SageMaker Autopilot
My dataset looks like this:
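If you want to peek at the raw file yourself, here is a quick sketch (the column names are the ones I assign to the test data further down; the CSV has no header row):

import pandas as pd

# column names as assigned later in this notebook; the file has no header row
cols = ["ItemPrice", "BranchNum", "ItemCode", "ItemDiscount",
        "year", "month", "day", "hour"]
df = pd.read_csv("pricing.csv", header=None, names=cols)
print(df.head())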
%%time

import os
import boto3
import re
import sagemaker

# Get a SageMaker-compatible role used by this Notebook Instance.
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

### update below values appropriately ###
bucket = sagemaker.Session().default_bucket()
prefix = "sagemaker/xgboost-no-fe"
####

print(region)
-----------------------------------------------------------------------------------
%%time

import io
import boto3
import random

# split the data into training, validation and testing sets
random.seed(42)


def data_split(
    FILE_DATA,
    DATA_DIR,
    FILE_TRAIN_0,
    FILE_TRAIN_1,
    FILE_VALIDATION,
    FILE_TEST,
    PERCENT_TRAIN_0,
    PERCENT_TRAIN_1,
    PERCENT_VALIDATION,
    PERCENT_TEST,
):
    data = [l for l in open(FILE_DATA, "r")]
    train_file_0 = open(DATA_DIR + "/" + FILE_TRAIN_0, "w")
    train_file_1 = open(DATA_DIR + "/" + FILE_TRAIN_1, "w")
    valid_file = open(DATA_DIR + "/" + FILE_VALIDATION, "w")
    tests_file = open(DATA_DIR + "/" + FILE_TEST, "w")

    # turn the requested percentages into row counts
    num_of_data = len(data)
    num_train_0 = int((PERCENT_TRAIN_0 / 100.0) * num_of_data)
    num_train_1 = int((PERCENT_TRAIN_1 / 100.0) * num_of_data)
    num_valid = int((PERCENT_VALIDATION / 100.0) * num_of_data)
    num_tests = int((PERCENT_TEST / 100.0) * num_of_data)

    data_fractions = [num_train_0, num_train_1, num_valid, num_tests]
    split_data = [[], [], [], []]

    # randomly move rows from the source data into each split
    for split_ind, fraction in enumerate(data_fractions):
        for i in range(fraction):
            rand_data_ind = random.randint(0, len(data) - 1)
            split_data[split_ind].append(data[rand_data_ind])
            data.pop(rand_data_ind)

    for l in split_data[0]:
        train_file_0.write(l)
    for l in split_data[1]:
        train_file_1.write(l)
    for l in split_data[2]:
        valid_file.write(l)
    for l in split_data[3]:
        tests_file.write(l)

    train_file_0.close()
    train_file_1.close()
    valid_file.close()
    tests_file.close()


def write_to_s3(fobj, bucket, key):
    return (
        boto3.Session(region_name=region)
        .resource("s3")
        .Bucket(bucket)
        .Object(key)
        .upload_fileobj(fobj)
    )


def upload_to_s3(bucket, channel, filename):
    fobj = open(filename, "rb")
    key = prefix + "/" + channel
    url = "s3://{}/{}".format(bucket, key)
    print("Writing to {}".format(url))
    write_to_s3(fobj, bucket, key)
-----------------------------------------------------------------------------------
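By the way, if you don't want to hand-roll the split, scikit-learn (which should already be available on a SageMaker notebook instance) can produce the same 35/35/15/15 proportions in a few lines. A sketch:

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("pricing.csv", header=None)
# 70% for training, later cut into two equal 35% halves
train, holdout = train_test_split(df, train_size=0.70, random_state=42)
train_0, train_1 = train_test_split(train, train_size=0.50, random_state=42)
# the remaining 30% becomes 15% validation and 15% test
validation, test = train_test_split(holdout, train_size=0.50, random_state=42)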
%%time

import pandas as pd

s3 = boto3.client("s3")

# Load the dataset from S3; the file has no header row
FILE_DATA = "pricing.csv"
s3.download_file(
    "pricing-sagemaker", "xgboost/data_no_fe_with_no_header.csv", FILE_DATA
)
data = pd.read_csv(FILE_DATA, header=None)
data.dropna(inplace=True)
data.to_csv(FILE_DATA, sep=",", index=False, header=False)

# split the downloaded data into train/test/validation files
FILE_TRAIN_0 = "pricing.train_0"
FILE_TRAIN_1 = "pricing.train_1"
FILE_VALIDATION = "pricing.validation"
FILE_TEST = "pricing.test"

PERCENT_TRAIN_0 = 35
PERCENT_TRAIN_1 = 35
PERCENT_VALIDATION = 15
PERCENT_TEST = 15

DATA_DIR = "data"
if not os.path.exists(DATA_DIR):
    os.mkdir(DATA_DIR)

data_split(
    FILE_DATA,
    DATA_DIR,
    FILE_TRAIN_0,
    FILE_TRAIN_1,
    FILE_VALIDATION,
    FILE_TEST,
    PERCENT_TRAIN_0,
    PERCENT_TRAIN_1,
    PERCENT_VALIDATION,
    PERCENT_TEST,
)
---------------------------------------------------------------------------------------
# upload the files to the S3 bucket
upload_to_s3(bucket, "train/train_0.csv", DATA_DIR + "/" + FILE_TRAIN_0)
upload_to_s3(bucket, "train/train_1.csv", DATA_DIR + "/" + FILE_TRAIN_1)
upload_to_s3(bucket, "validation/validation.csv", DATA_DIR + "/" + FILE_VALIDATION)
upload_to_s3(bucket, "test/test.csv", DATA_DIR + "/" + FILE_TEST)
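To double-check that all four files actually landed under the prefix the training job will read from, you can list the bucket (a small sketch):

# sanity check: list everything we uploaded under our prefix
s3_client = boto3.client("s3")
listing = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)
for obj in listing.get("Contents", []):
    print(obj["Key"])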
--------------------------------------------------------------------------------------
instance_type = "ml.m5.2xlarge"

output_path = "s3://{}/{}/{}/output".format(bucket, prefix, "pricing-dist-xgb")

# XGBoost supports several content types. We will use CSV.
content_type = "text/csv"
--------------------------------------------------------------------------------------
import sagemaker
import boto3
from sagemaker import image_uris
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput

# initialize hyperparameters
hyperparams = {
    "max_depth": "7",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "reg:squarederror",
    "num_round": "100",
    "eval_metric": "rmse",
    "verbosity": "2",
}

# set an output path where the trained model will be saved
output_path = "s3://{}/{}/{}/output".format(bucket, prefix, "pricing-xgb-built-in-algo")

# this line automatically looks up the XGBoost image URI and builds an XGBoost container.
# specify the repo_version depending on your preference.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-2")

# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    hyperparameters=hyperparams,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    volume_size=5,  # 5 GB
    output_path=output_path,
)

train_input = TrainingInput(
    "s3://{}/{}/{}/".format(bucket, prefix, "train"), content_type=content_type
)
validation_input = TrainingInput(
    "s3://{}/{}/{}/".format(bucket, prefix, "validation"), content_type=content_type
)

# this one actually does the job by training the model
estimator.fit({"train": train_input, "validation": validation_input})
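Once the job finishes, you can pull the train/validation RMSE it logged without opening CloudWatch, via the SDK's analytics helper (a sketch):

from sagemaker.analytics import TrainingJobAnalytics

# fetch the metric series (train:rmse, validation:rmse) the job emitted
job_name = estimator.latest_training_job.name
print(TrainingJobAnalytics(job_name).dataframe())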
# the deploy command will create a REST API endpoint
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.2xlarge",
    endpoint_name="Xgboost-basic",
)
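One caveat: the endpoint keeps costing money for as long as it exists, so once you are done experimenting, tear it down:

# delete the endpoint when finished to stop paying for the instance behind it
predictor.delete_endpoint()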
-------------------------------------------------------------------------------------
# read the test data. We want to predict the price column, so we will drop it
data_train = pd.read_csv(DATA_DIR + "/" + FILE_TEST, header=None)
data_train.columns = [
    "ItemPrice",
    "BranchNum",
    "ItemCode",
    "ItemDiscount",
    "year",
    "month",
    "day",
    "hour",
]

# Remove the item price column since we want to predict it
data_train = data_train.drop(["ItemPrice"], axis=1)
------------------------------------------------------------------------------------
# prepare the payload and show the data
import io
from io import StringIO

csv_file = io.StringIO()
data_train = data_train.iloc[1:, :]

# the payload size is limited so we keep only 1000 records before serializing
data_train = data_train.head(1000)

# by default SageMaker expects comma separated data
data_train.to_csv(csv_file, sep=",", header=False, index=False)

payload = csv_file.getvalue()
print(payload)
-----------------------------------------------------------------------------------
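The reason for the head(1000) truncation above is that real-time endpoints cap the request body size (around 6 MB, as far as I know). You can check how big the payload actually is before sending it (a sketch):

# invoke_endpoint limits the request body size, so measure the payload first
payload_kb = len(payload.encode("utf-8")) / 1024
print("payload size: {:.1f} KB".format(payload_kb))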
# run the prediction and print the output
runtime_client = boto3.client("runtime.sagemaker", region_name=region)
response = runtime_client.invoke_endpoint(
    EndpointName=predictor.endpoint_name, ContentType="text/csv", Body=payload
)
result = response["Body"].read().decode("ascii")
print("Predicted values are {}.".format(result))
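The endpoint answers with a single comma-separated string of predictions, so it helps to turn it back into numbers (a sketch):

# parse the comma-separated response into a list of floats
predictions = [float(v) for v in result.strip().rstrip(",").split(",")]
print("got {} predictions, first few: {}".format(len(predictions), predictions[:5]))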
-------------------------------------------------------------------------------------

I am going to post the links to the notebook and the S3 data file at the end of this post.
# run the prediction for a single record and print the output
runtime_client = boto3.client("runtime.sagemaker", region_name=region)
response = runtime_client.invoke_endpoint(
    EndpointName=predictor.endpoint_name,
    ContentType="text/csv",
    Body="003,8690784516778,0,2022,03,15,17",
)
result = response["Body"].read().decode("ascii")
print("Predicted values are {}.".format(result))
The predicted prices for three sample records with the basic feature engineering:

66.95
13.73
15.08
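If you end up scoring single records like this a lot, a tiny wrapper keeps it readable (a hypothetical helper; the name is mine):

def predict_one(row_csv):
    # send one comma-separated feature row to the endpoint, return the price
    resp = runtime_client.invoke_endpoint(
        EndpointName=predictor.endpoint_name,
        ContentType="text/csv",
        Body=row_csv,
    )
    return float(resp["Body"].read().decode("ascii"))

print(predict_one("003,8690784516778,0,2022,03,15,17"))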
Now I repeat the same steps (split, upload, train, deploy) with the dataset that has the more advanced feature engineering. The only real differences are the source file and the column list:

s3.download_file(
    "pricing-sagemaker", "xgboost/data_fe_with_no_header.csv", FILE_DATA
)
data_train = pd.read_csv(DATA_DIR + "/" + FILE_TEST, header=None)
data_train.columns = [
    "ItemPrice",
    "ItemCode",
    "ItemDiscount",
    "year",
    "month",
    "day",
    "hour",
    "Is1", "Is2", "Is3", "Is5", "Is6", "Is7", "Is8", "Is9", "Is10",
    "Is12", "Is13", "Is14", "Is15", "Is16", "Is50", "Is334",
]

# Remove the item price column since we want to predict it
data_train = data_train.drop(["ItemPrice"], axis=1)
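The Is* columns look like one-hot indicators for the branch number (notice BranchNum itself is gone from this version of the dataset). For reference, that kind of encoding is nearly a one-liner in pandas; a sketch, assuming you start from the raw data of the first part:

import pandas as pd

cols = ["ItemPrice", "BranchNum", "ItemCode", "ItemDiscount",
        "year", "month", "day", "hour"]
raw = pd.read_csv("pricing.csv", header=None, names=cols)
# expand BranchNum into Is<branch> indicator columns (Is1, Is2, ...)
dummies = pd.get_dummies(raw["BranchNum"], prefix="Is", prefix_sep="")
raw = pd.concat([raw.drop(columns=["BranchNum"]), dummies], axis=1)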
------------------------------------------------------------------------------------
The predictions for the same three records, this time with advanced feature engineering:

87.33
14.92
14.62

Original Values    Basic Feature Eng.    Advanced Feature Eng.
109.9              66.95                 87.33
8.9                13.73                 14.92
14.9               15.08                 14.62

Except for the last value, it doesn't look too good. In part 2 I will try the same with the Linear Learner algorithm; maybe it will do a better job.

All files from this blog can be found here
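P.S. Out of curiosity, a back-of-the-envelope error check on just these three rows (far too few for a real evaluation, but it suggests the advanced features do help, at least here):

# RMSE over the three sample rows from the table above
actual = [109.9, 8.9, 14.9]
basic = [66.95, 13.73, 15.08]
advanced = [87.33, 14.92, 14.62]

def rmse(y_true, y_pred):
    return (sum((a - p) ** 2 for a, p in zip(y_true, y_pred)) / len(y_true)) ** 0.5

print("basic FE RMSE:    {:.2f}".format(rmse(actual, basic)))
print("advanced FE RMSE: {:.2f}".format(rmse(actual, advanced)))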