Source code for musif.process.filter
from typing import Dict, Union
from pandas import DataFrame
import pandas as pd
import os
from musif.extract.features.core.constants import FILE_NAME
# from musif.extract.features.intervals.constants import FILE_NAME
import matplotlib.pyplot as plt
from musif.logs import perr, pinfo
import numpy as np
class DataFilter:
    """Load extracted features and summarise interval usage per group.

    The instance wraps a DataFrame, given directly or read from a .csv
    file, and offers :meth:`filter_data` to compute, for a selection of
    groups, the percentage that each interval count column represents.

    Attributes
    ----------
    info : Union[str, DataFrame]
        The value passed to the constructor (a .csv path or a DataFrame).
    data : DataFrame
        The loaded table; an empty DataFrame when loading failed.
    destination_route : str
        The input path without its ``.csv`` extension. Only set when
        ``info`` is a path to an existing file.
    """

    def __init__(self, info: Union[str, DataFrame], *args, **kwargs):
        """
        Parameters
        ----------
        info : Union[str, DataFrame]
            Either a path to a .csv file or a DataFrame holding the
            extracted features.
        *args, **kwargs
            Accepted for call compatibility; currently not used.
        """
        self.info = info
        self.data = self._process_info(self.info)

    def _process_info(self, info: Union[str, DataFrame]) -> DataFrame:
        """Return the feature table described by ``info``.

        Parameters
        ----------
        info : Union[str, DataFrame]
            Path to a .csv file, or an already loaded DataFrame.

        Returns
        -------
        DataFrame
            The DataFrame itself when one was given, the content of the
            .csv file when a path was given, or an empty DataFrame when
            the input type is unsupported, the path does not exist, or
            the file holds no data. Failures are reported through
            ``perr`` rather than raised.
        """
        if isinstance(info, DataFrame):
            pinfo("\nProcessing DataFrame...")
            return info
        if not isinstance(info, str):
            perr(
                "Wrong info type! You must introduce either a DataFrame either the name of a .csv file."
            )
            return pd.DataFrame()

        pinfo("\nReading csv file...")
        load_error = (
            "Data could not be loaded. Either wrong path or an empty file was found."
        )
        try:
            if not os.path.exists(info):
                raise FileNotFoundError(info)
            # Strip only a trailing ".csv"; a ".csv" occurring elsewhere in
            # the path (e.g. in a directory name) must be left alone.
            root, extension = os.path.splitext(info)
            self.destination_route = root if extension == ".csv" else info
            df = pd.read_csv(
                info, low_memory=False, sep=",", encoding_errors="replace"
            )
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # read_csv raises EmptyDataError for a file without any columns.
            perr(load_error)
            return pd.DataFrame()
        if df.empty:
            perr(load_error)
            return pd.DataFrame()
        return df

    def filter_data(
        self, by: str = None, equal_to: Union[list, None] = None, instrument: str = ""
    ) -> DataFrame:
        """Compute interval percentages for the selected groups.

        Rows of ``self.data`` whose ``by`` value is listed in ``equal_to``
        are grouped by that value; for every group the interval count
        columns of ``instrument`` are turned into percentages. The result
        is passed to :meth:`post_process_intervals`, which writes an Excel
        file and a bar plot, and is finally printed.

        Parameters
        ----------
        by : str
            Name of the column used to select and group rows.
        equal_to : list, optional
            Values of ``by`` to keep. ``None`` behaves like an empty list.
        instrument : str
            Substring a column name must contain to be considered.

        Returns
        -------
        DataFrame
            One row per group. Empty, with nothing written to disk, when
            no row matches the selection.

        Raises
        ------
        KeyError
            If ``by`` is not a column of ``self.data``.
        """
        if by not in self.data.columns:
            raise KeyError(f"Column {by!r} not found in the data.")
        wanted = [] if equal_to is None else equal_to

        percentages_intervals = self._collect_percentages(by, wanted, instrument)
        if percentages_intervals.empty:
            # Nothing to export or plot; plotting an empty frame would fail.
            perr(f"No rows found where '{by}' is one of the requested values.")
            return percentages_intervals

        percentages_intervals = self.post_process_intervals(
            by, instrument, percentages_intervals
        )
        print(percentages_intervals)
        return percentages_intervals

    def _collect_percentages(self, by: str, equal_to, instrument: str) -> DataFrame:
        """Build one row of interval percentages per selected ``by`` value."""
        selected = self.data.loc[self.data[by].isin(equal_to)]
        rows = []
        for group in sorted(set(selected[by])):
            percentages = self._filter_intervals(
                selected[selected[by] == group], instrument
            )
            percentages[by] = group
            rows.append(percentages)
        # Build the frame in one go: DataFrame.append no longer exists in
        # pandas 2.x and copied the whole frame on every call before that.
        return pd.DataFrame(rows)

    def post_process_intervals(self, by, instrument, percentages_intervals: DataFrame):
        """Clean up the percentage table, export it and plot it.

        Writes ``intervals_per_aria.xlsx`` and ``intervals.png`` in the
        current working directory and shows the plot.

        Parameters
        ----------
        by : str
            Name of the grouping column; used as the x axis of the plot.
        instrument : str
            Instrument substring removed from the column names.
        percentages_intervals : DataFrame
            Table with one row per group. It is not modified.

        Returns
        -------
        DataFrame
            A new table without all-zero interval columns, with columns
            sorted by their original names and renamed for display.
        """
        tidy = self._tidy_interval_columns(by, instrument, percentages_intervals)
        excel_name = "intervals_per_aria.xlsx"
        tidy.to_excel(excel_name)
        pinfo(f"File saved succesfully as: {excel_name}")
        tidy.plot(x=by, kind="bar")
        plt.savefig("intervals.png")
        plt.show()
        return tidy

    @staticmethod
    def _tidy_interval_columns(
        by, instrument, percentages_intervals: DataFrame
    ) -> DataFrame:
        """Drop all-zero interval columns, sort the columns and shorten their names."""
        # Series.max() skips NaN, whereas the builtin max() yields a result
        # that depends on where the NaN sits in the column.
        all_zero = [
            column
            for column in percentages_intervals.filter(like="Interval").columns
            if column != by and percentages_intervals[column].max() == 0.0
        ]
        tidy = percentages_intervals.drop(columns=all_zero)
        tidy = tidy.reindex(sorted(tidy.columns), axis=1)
        # The grouping column keeps its name so that it can still be used
        # as the x axis of the plot afterwards.
        tidy.columns = [
            column
            if column == by
            else column.replace(instrument, "")
            .replace("_Percentage", "")
            .replace("_", " ")
            .replace("Interval", "")
            for column in tidy.columns
        ]
        return tidy

    @staticmethod
    def _count_percentages(counts: DataFrame) -> Dict[str, float]:
        """Express the sum of every count column as a percentage of the grand total.

        Each ``<name>_Count`` column yields a ``<name>_Percentage`` entry.
        When the grand total is zero the percentages are NaN.
        """
        total = counts.sum(axis=0).sum()
        percentages = {}
        for column in counts:
            if total:
                share = (np.nansum(counts[column]) / total) * 100
            else:
                # Undefined share; avoids a division-by-zero warning.
                share = float("nan")
            percentages[column.replace("_Count", "") + "_Percentage"] = share
        return percentages

    def _filter_intervals(self, aria_data, instrument) -> Dict[str, float]:
        """Return the percentage of each interval count column of ``instrument``.

        A column is considered when its name contains ``_Interval``,
        ``instrument`` and ``Count``, and does not contain ``Intervals``.
        """
        interval_counts = [
            column
            for column in aria_data
            if "_Interval" in column
            and instrument in column
            and "Intervals" not in column
            and "Count" in column
        ]
        return self._count_percentages(aria_data[interval_counts])

    def _filter_stepwise(self, aria_data, instrument) -> Dict[str, float]:
        """Return the percentage of each stepwise count column of ``instrument``.

        A column is considered when its name contains ``stepwise`` (in any
        letter case), ``instrument`` and ``Count``.
        """
        stepwise_counts = [
            column
            for column in aria_data
            if "stepwise" in column.lower()
            and instrument in column
            and "Count" in column
        ]
        return self._count_percentages(aria_data[stepwise_counts])