ORP 数据整理
🔖 python
🔖 academic
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
dir_path = os.path.dirname(__file__)  # directory containing this script
file_path = os.path.join(dir_path, "Raw data")  # folder holding the raw exports
# toc: merge the csv files
# Keep only the CSV exports. A new list is built instead of calling
# file_list.remove() inside a loop over file_list: removing while iterating
# skips the entry that follows each removed one, so consecutive non-CSV
# entries were left in the list.
file_list = [name for name in os.listdir(file_path) if ".CSV" in name]
frames = []  # one cleaned frame per raw file
for file in file_list:
    tmp_file_path = os.path.join(file_path, file)
    # Read with the default "," separator, take the first column and split it
    # on ";" to obtain the actual fields.
    dataset = (
        pd.read_csv(tmp_file_path, header=None, index_col=None)
        .iloc[:, 0]
        .str.split(";", expand=True)
    )
    dataset.columns = dataset.iloc[0, :]  # the first row carries the header names
    dataset = dataset.loc[1:, ["Date/Time", "Value"]]  # keep only the data rows
    dataset.columns = ["Date", "Value"]  # rename the header
    dataset.set_index("Date", inplace=True)  # index by the Date column
    frames.append(dataset)
# Concatenate once at the end; concatenating inside the loop copies the
# accumulated result again for every file.
res = pd.concat(frames, axis=0) if frames else pd.DataFrame([])
# Normalise every timestamp to minute resolution ("YYYY-MM-DD HH:MM").
res.index = pd.to_datetime(res.index).strftime("%Y-%m-%d %H:%M")
# Handle duplicates
res = res.sort_index(ascending=True)
res = res[~res.index.duplicated(keep="first")]  # drop rows with a repeated index
# toc: save the result that still contains gaps
res_path = os.path.join(dir_path, "res_isna.csv")
res.to_csv(res_path)  # file: output
# toc: fill in the missing values
dir_path = os.path.dirname(__file__)  # directory containing this script
file_path = os.path.join(dir_path, "res_isna.csv")  # merged result that may contain gaps
dataset = pd.read_csv(file_path, index_col=[0], header=[0], parse_dates=True)  # load the file
# Complete minute-by-minute grid from the first to the last recorded timestamp.
date_range = pd.date_range(dataset.index.min(), dataset.index.max(), freq="min")
# Align the readings to that grid; minutes without a reading become NaN.
res = dataset["Value"].reindex(date_range)
print("缺失值有 {num} 个:\n".format(num=res[res.isna()].shape[0]), res[res.isna()])
# Forward-fill the gaps. ffill() replaces fillna(method="ffill"), whose
# `method` argument is deprecated in current pandas releases.
res = res.ffill()
# toc: save the gap-free result
res.index.name = "Date"
res_path = os.path.join(dir_path, "res_nona.csv")
res.to_csv(res_path)  # file: output
# toc: plotting
dir_path = os.path.dirname(__file__)  # folder that holds this script
file_path = os.path.join(dir_path, "res_nona.csv")  # output of the missing-value step
dataset = pd.read_csv(
    file_path,
    header=[0],
    index_col=[0],
    parse_dates=True,
)  # first column becomes the date-parsed index
year_li = {stamp.year for stamp in dataset.index}  # distinct years present in the data
print("当前年份有:", year_li)
# Annotation for the plots: x positions (hour of day) of the vertical marker
# lines. The same three offsets are repeated for 4 consecutive 6-hour blocks.
_CYCLES_PER_DAY = 4
_CYCLE_HOURS = 6
_MARK_OFFSETS = (
    1,  # after this cycle ends
    2.5 + 1,  # after the anaerobic phase
    5.0 + 1,  # after the aerobic phase
)
line_li = [
    offset + _CYCLE_HOURS * i
    for i in range(_CYCLES_PER_DAY)
    for offset in _MARK_OFFSETS
]
# Plot every month of every year separately: one figure with a curve per day
# (fig_1) and one figure with the daily maximum/minimum (fig_2).
# Styling of fig_2 does not change between iterations, so it is set once.
markersize = 10
linewidth = 0.5
alpha = 1
color_li = [(85 / 255, 187 / 255, 194 / 255), (232 / 255, 125 / 255, 115 / 255)]
tmp_x = 0.7
tmp_y = 20  # offsets applied to the value labels
for year in sorted(year_li):
    # Restrict to the current year first. Selecting months from the whole
    # dataset mixed the same month of different years into one figure and
    # wrote identical files for every year.
    year_dataset = dataset[dataset.index.year == year]
    month_li = set(year_dataset.index.month.unique().tolist())  # months with data
    print("当前月份有:", month_li)
    for month in sorted(month_li):
        month_dataset = year_dataset[year_dataset.index.month == month]  # data of this month
        # Sorted so that subplot order and the x axis of fig_2 are chronological
        # (set iteration order is not guaranteed).
        day_li = sorted(month_dataset.index.day.unique().tolist())  # days with data
        print("{month}月有数据的日期为:".format(month=month), day_li)
        # ORP curve: one subplot per day. squeeze=False always returns a 2-D
        # array, so a month with a single day can be handled the same way.
        fig_1 = plt.figure(figsize=(8 * 2.5, 6 * len(day_li)))
        axes = fig_1.subplots(len(day_li), squeeze=False)[:, 0]
        # ORP extremum: daily maximum and minimum ORP
        fig_2 = plt.figure(figsize=(8 * 2, 6))
        ax_extremum = fig_2.add_subplot(111)
        max_ORP, min_ORP = [], []
        # Plot each day separately
        for ax, day in zip(axes, day_li):
            day_dataset = month_dataset[month_dataset.index.day == day]
            # Blank out the idle hours. where() builds a new Series, so the
            # filtered frame is never written to (no chained-assignment
            # warning) and the dtype is upcast cleanly when needed.
            idle = day_dataset.index.hour.isin([0, 6, 12, 18])
            day_values = day_dataset["Value"].where(~idle)
            max_ORP.append(day_values.max())  # maximum ORP of the day
            min_ORP.append(day_values.min())  # minimum ORP of the day
            # fig_1
            # Hour of day of every sample, taken from its own timestamp so the
            # curve stays correctly placed even if samples are unevenly spaced.
            x = np.asarray(day_dataset.index.hour + day_dataset.index.minute / 60)
            ax.plot(x, day_values.to_numpy(), color="green", linewidth=2)
            ax.set_xlim(-0.5, 24.5)
            ax.set_ylim(-300, 600)
            ax.xaxis.set_major_locator(plt.MultipleLocator(1))
            ax.set_xlabel("Elapsed time (hour)")
            ax.set_ylabel("ORP (mV)")
            ax.set_title(day_dataset.index[0].strftime("%Y-%m-%d"))
            ax.tick_params(axis="both", which="major", direction="inout")
            # Marker lines: positions with remainder 1 modulo 6 are drawn bold
            # and red, all others thin and black.
            for line in line_li:
                if line % 6 == 1:
                    ax.axvline(x=line, linestyle="--", color="red", linewidth=2)
                else:
                    ax.axvline(x=line, linestyle="--", color="black", linewidth=0.3)
        # fig_2
        data_li = [max_ORP, min_ORP]
        for ORP, face_color in zip(data_li, color_li):
            ax_extremum.plot(
                day_li,
                ORP,
                marker="o",
                color="black",
                markersize=markersize,
                markerfacecolor=face_color,
                markeredgecolor="black",
                markeredgewidth=1,
                linewidth=linewidth,
                linestyle="--",
                alpha=alpha,
            )
        ax_extremum.set_xlim(-0.3, month_dataset.index[0].days_in_month + 0.3)
        ax_extremum.set_ylim(-300, 700)
        ax_extremum.xaxis.set_major_locator(plt.MultipleLocator(2))  # major ticks
        ax_extremum.yaxis.set_major_locator(plt.MultipleLocator(100))  # major ticks
        ax_extremum.set_xlabel("Elapsed time (day)")
        ax_extremum.set_ylabel("ORP (mV)")
        ax_extremum.set_title(month_dataset.index[0].strftime("%Y-%m"))
        ax_extremum.tick_params(axis="both", which="major", direction="inout")
        # Label every extremum with its value, shifted by (tmp_x, tmp_y).
        for extremum in data_li:
            for day, value in zip(day_li, extremum):
                if pd.isna(value):
                    continue  # no usable reading that day, nothing to label
                ax_extremum.text(day - tmp_x, value + tmp_y, s=str(value))
        # toc: save the figures
        fig_1_path = os.path.join(
            dir_path, "Curve_{year}_{month}.pdf".format(year=year, month=month)
        )
        fig_1.savefig(fig_1_path)
        fig_2_path = os.path.join(
            dir_path, "Min_max_{year}_{month}.pdf".format(year=year, month=month)
        )
        fig_2.savefig(fig_2_path)
        # Release the figures; pyplot keeps every figure alive until closed.
        plt.close(fig_1)
        plt.close(fig_2)