ORP 数据整理

🔖 python
🔖 academic
Author

Guangyao Zhao

Published

Feb 7, 2023

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Locate the "Raw data" folder that sits next to this script.
dir_path = os.path.dirname(__file__)  # directory path
file_path = os.path.join(dir_path, "Raw data")  # dataset folder path

# toc: merge csv files
# Keep only the entries whose name contains ".CSV" (same test as before).
# The list is rebuilt instead of calling remove() while iterating over it:
# removing during iteration shifts the remaining items, so the entry right
# after every removed one was skipped and could stay in the list.
file_list = [file for file in os.listdir(file_path) if ".CSV" in file]

# Read every raw file into a one-column ("Value") frame indexed by "Date".
# The frames are collected and concatenated once after the loop; calling
# pd.concat inside the loop re-copies the accumulated result for every file.
frames = []
for file in file_list:
    tmp_file_path = os.path.join(file_path, file)
    # Split the data: take the first column produced by read_csv and split
    # it on ";" into separate columns.
    raw_lines = pd.read_csv(tmp_file_path, header=None, index_col=None).iloc[:, 0]
    dataset = raw_lines.str.split(";", expand=True)
    dataset.columns = dataset.iloc[0, :]  # first row holds the header names
    dataset = dataset.loc[1:, ["Date/Time", "Value"]]  # keep the data rows only
    dataset.columns = ["Date", "Value"]  # rename the columns
    dataset = dataset.set_index("Date")  # use the Date column as index
    frames.append(dataset)

# Merged result; fall back to an empty frame when no file was found.
res = pd.concat(frames, axis=0) if frames else pd.DataFrame([])
# Normalise the timestamps to minute resolution, stored as text.
res.index = pd.to_datetime(res.index).strftime("%Y-%m-%d %H:%M")

# Handle duplicates: drop repeated timestamps while the rows are still in
# read order (so "first" does not depend on sort stability), then sort.
res = res[~res.index.duplicated(keep="first")]
res = res.sort_index(ascending=True)

# toc: save the result that still contains missing values
res_path = os.path.join(dir_path, "res_isna.csv")
res.to_csv(res_path)  # file: output result

# toc: handle missing values
dir_path = os.path.dirname(__file__)  # directory path
file_path = os.path.join(dir_path, "res_isna.csv")  # dataset file path
dataset = pd.read_csv(file_path, index_col=[0], header=[0], parse_dates=True)  # read file

# Put the "Value" column on a gap-free one-minute grid that spans the first
# to the last timestamp; minutes absent from the file become NaN.
date_range = pd.date_range(dataset.index.min(), dataset.index.max(), freq="min")
res = dataset["Value"].reindex(date_range)

# Report the missing minutes before they are filled.
missing = res[res.isna()]
print("缺失值有 {num} 个:\n".format(num=missing.shape[0]), missing)

# Fill each gap with the last preceding value. Series.ffill() replaces
# fillna(method="ffill"), whose `method` argument is deprecated in recent
# pandas releases.
res = res.ffill()

# toc: save the result without missing values
res.index.name = "Date"
res_path = os.path.join(dir_path, "res_nona.csv")
res.to_csv(res_path)  # file: output result

# toc: plotting
# Reload the gap-free series written by the previous step.
dir_path = os.path.dirname(__file__)  # directory path
file_path = os.path.join(dir_path, "res_nona.csv")  # dataset file path
dataset = pd.read_csv(
    file_path,
    index_col=[0],
    header=[0],
    parse_dates=True,
)  # read file

# Distinct calendar years present in the index.
year_li = {timestamp.year for timestamp in dataset.index}
print("当前年份有:", year_li)

# Annotation positions (x values, in hours) for the daily plots.
# Four groups spaced 6 hours apart, starting at hour 1; each group adds
# three marks.
line_li = []
for cycle_start in range(1, 24, 6):
    line_li.extend(
        (
            cycle_start,  # after the current cycle ends
            cycle_start + 2.5,  # after the anaerobic phase
            cycle_start + 5.0,  # after the aerobic phase
        )
    )

def _draw_day_curve(ax, day_values, marker_lines):
    """Plot one day's ORP values on *ax* and add the vertical marker lines.

    ax: matplotlib Axes to draw on.
    day_values: Series of that day's values, indexed by timestamp.
    marker_lines: x positions (hours) of the vertical annotation lines.
    """
    stamps = day_values.index
    # Place every sample at its own time of day (in hours) instead of
    # assuming the samples are evenly spread between first and last.
    hours = np.asarray(stamps.hour) + np.asarray(stamps.minute) / 60
    ax.plot(hours, day_values.to_numpy(), color="green", linewidth=2)

    ax.set_xlim(-0.5, 24.5)
    ax.set_ylim(-300, 600)
    ax.xaxis.set_major_locator(plt.MultipleLocator(1))
    ax.set_xlabel("Elapsed time (hour)")
    ax.set_ylabel("ORP (mV)")
    ax.set_title("{date}".format(date=stamps[0].strftime("%Y-%m-%d")))
    ax.tick_params(axis="both", which="major", direction="inout")

    # Marker lines: positions with remainder 1 modulo 6 are drawn bold red,
    # all others as thin black lines.
    for line in marker_lines:
        if line % 6 == 1:
            ax.axvline(x=line, linestyle="--", color="red", linewidth=2)
        else:
            ax.axvline(x=line, linestyle="--", color="black", linewidth=0.3)


def _draw_monthly_extrema(ax, days, max_orp, min_orp, month_start):
    """Plot the per-day maximum and minimum values of one month on *ax*.

    ax: matplotlib Axes to draw on.
    days: day-of-month numbers, one per entry of max_orp / min_orp.
    max_orp, min_orp: daily maxima and minima, in the same order as days.
    month_start: first timestamp of the month (used for title and x range).
    """
    markersize = 10
    linewidth = 0.5
    alpha = 1
    color_li = [(85 / 255, 187 / 255, 194 / 255), (232 / 255, 125 / 255, 115 / 255)]
    for face_color, extremes in zip(color_li, (max_orp, min_orp)):
        ax.plot(
            days,
            extremes,
            marker="o",
            color="black",
            markersize=markersize,
            markerfacecolor=face_color,
            markeredgecolor="black",
            markeredgewidth=1,
            linewidth=linewidth,
            linestyle="--",
            alpha=alpha,
        )

    ax.set_xlim(-0.3, month_start.days_in_month + 0.3)
    ax.set_ylim(-300, 700)
    ax.xaxis.set_major_locator(plt.MultipleLocator(2))  # major ticks
    ax.yaxis.set_major_locator(plt.MultipleLocator(100))  # major ticks
    ax.set_xlabel("Elapsed time (day)")
    ax.set_ylabel("ORP (mV)")
    ax.set_title("{date}".format(date=month_start.strftime("%Y-%m")))
    ax.tick_params(axis="both", which="major", direction="inout")

    # Write each value next to its marker, shifted by a fixed offset.
    tmp_x = 0.7
    tmp_y = 20  # annotation offset
    for extremes in (max_orp, min_orp):
        for day, value in zip(days, extremes):
            ax.text(day - tmp_x, value + tmp_y, s=value)


# Produce the figures month by month.
for year in sorted(year_li):
    # Restrict to the current year first; otherwise the same calendar month
    # of different years is mixed together and saved under every year's name.
    year_dataset = dataset[dataset.index.year == year]
    month_li = set(year_dataset.index.month.tolist())  # months that have data
    print("当前月份有:", month_li)
    for month in sorted(month_li):
        month_dataset = year_dataset[year_dataset.index.month == month]  # current month
        # Days of this month that have data; sorted because set iteration
        # order is not guaranteed to be ascending.
        day_li = sorted(set(month_dataset.index.day.tolist()))
        print("{month}月有数据的日期为:".format(month=month), day_li)

        # ORP curve: one subplot per day. squeeze=False keeps an array even
        # when there is only a single day.
        fig_1 = plt.figure(figsize=(8 * 2.5, 6 * len(day_li)))
        axes = fig_1.subplots(len(day_li), squeeze=False)[:, 0]

        # ORP extremum: daily maximum and minimum
        fig_2 = plt.figure(figsize=(8 * 2, 6))
        ax_extremum = fig_2.add_subplot(111)
        max_ORP, min_ORP = [], []

        # Plot each day separately.
        for j, day in enumerate(day_li):
            day_dataset = month_dataset[month_dataset.index.day == day]
            # Blank out the idle hours (0, 6, 12, 18). where() returns a new
            # Series, so nothing is assigned into a slice of month_dataset.
            idle = day_dataset.index.hour.isin([0, 6, 12, 18])
            day_values = day_dataset["Value"].where(~idle)
            max_ORP.append(day_values.max())  # maximum of the day
            min_ORP.append(day_values.min())  # minimum of the day

            _draw_day_curve(axes[j], day_values, line_li)

        _draw_monthly_extrema(
            ax_extremum, day_li, max_ORP, min_ORP, month_dataset.index[0]
        )

        # toc: save the figures
        fig_1_path = os.path.join(
            dir_path, "Curve_{year}_{month}.pdf".format(year=year, month=month)
        )
        fig_1.savefig(fig_1_path)

        fig_2_path = os.path.join(
            dir_path, "Min_max_{year}_{month}.pdf".format(year=year, month=month)
        )
        fig_2.savefig(fig_2_path)

        # Release the figures once written so they do not pile up in memory
        # across months.
        plt.close(fig_1)
        plt.close(fig_2)