# SPDX-FileCopyrightText: 2024 Helmholtz-Zentrum Dresden-Rossendorf (HZDR)
# SPDX-License-Identifier: MIT

from pandas import read_csv, to_datetime, DataFrame
from pathlib import Path
from math import nan
import label

def prepare_data(data_file: Path) -> DataFrame:
    """ Read a weather data file and return it labelled and cleaned.

        The file might be compressed and is expected to hold data in the
        ISDLite format: no header row, columns separated by whitespace.
        Column names are taken from the `label`-module. The separate date
        and time columns are merged into a single datetime index, the
        missing value indicator -9999 is turned into nan and the columns
        listed in `label.SCALED_COLUMNS` are divided by 10.

        Args:
            data_file: A path to an existing data file that is to be loaded

        Returns:
            A fully labelled and cleaned pandas data frame
    """

    # The file has no header row; any run of whitespace separates two columns
    frame = read_csv(data_file, header=None, sep=r"\s+")

    # IMPORTANT: naming the columns like this relies on the file providing
    # them in exactly this order (date/time parts first, then measurements)
    frame.columns = label.DATE_TIME_COLUMNS + label.MEASUREMENT_COLUMNS

    # Build one timestamp per row out of the separate date and time columns
    timestamps = to_datetime(frame[label.DATE_TIME_COLUMNS])
    frame[label.INDEX_COLUMN] = timestamps

    # Index by timestamp, discard the now redundant date and time columns
    # and mark the missing value indicator -9999 as nan
    frame = (
        frame
        .set_index(label.INDEX_COLUMN)
        .drop(columns=label.DATE_TIME_COLUMNS)
        .replace({-9999: nan})
    )

    # Undo the scaling factor of 10 applied to the scaled columns
    scaled_part = frame[label.SCALED_COLUMNS]
    frame[label.SCALED_COLUMNS] = scaled_part.div(10)

    return frame
