Source code for musif.process.utils

from logging.config import dictConfig
import re
from typing import List

import numpy as np
import pandas as pd
from musif.config import (
    ENDSWITH,
    INSTRUMENTS_TO_DELETE,
    STARTSWITH,
    SUBSTRING_TO_DELETE,
)
from musif.extract.features.core.constants import FILE_NAME
from musif.extract.basic_modules.scoring.constants import (
    FAMILY_INSTRUMENTATION,
    FAMILY_SCORING,
    VOICES,
)
from musif.extract.features.harmony.constants import CHORD_prefix
from musif.extract.features.texture.constants import TEXTURE
from musif.extract.features.ambitus.constants import (
    HIGHEST_NOTE_INDEX,
    LOWEST_NOTE_INDEX,
)
from musif.extract.features.harmony.constants import (
    KEY_MODULATORY,
    KEY_PERCENTAGE,
    KEY_PREFIX,
)
from musif.extract.features.interval.constants import (
    REPEATED_NOTES_COUNT,
    TRIMMED_INTERVALLIC_MEAN,
)
from musif.extract.features.prefix import get_part_prefix, get_sound_prefix
from musif.extract.features.scale.constants import DEGREE_PREFIX

from musif.logs import pinfo
from pandas import DataFrame
from tqdm import tqdm

from .constants import voices_list_prefixes


def replace_nans(df):
    """Replace missing values with the string ``"NA"`` in selected columns.

    A column is selected when its name

    * contains ``"Interval"``, ``"_Degree"``, ``TRIMMED_INTERVALLIC_MEAN``
      or ``"_Dyn"``;
    * starts with ``"Key_"``, ``CHORD_prefix``, ``"Chords_"``,
      ``"Additions_"`` or ``"Numerals_"``; or
    * ends with ``"_DottedRhythm"`` or ``"_DoubleDottedRhythm"``.

    The selected columns are reassigned on ``df`` itself; nothing is returned.
    """
    prefixes = ("Key_", CHORD_prefix, "Chords_", "Additions_", "Numerals_")
    suffixes = ("_DottedRhythm", "_DoubleDottedRhythm")
    # The previous test was written as
    #     ("_Degree" and TRIMMED_INTERVALLIC_MEAN and "_Dyn") in col
    # which Python evaluates as `"_Dyn" in col` only (``and`` yields its last
    # operand when all are truthy). Each substring is now checked explicitly.
    substrings = ("Interval", "_Degree", TRIMMED_INTERVALLIC_MEAN, "_Dyn")

    for col in df.columns:
        if (
            col.startswith(prefixes)
            or col.endswith(suffixes)
            or any(token in col for token in substrings)
        ):
            df[col] = df[col].fillna("NA")
def merge_duetos_trios(df: DataFrame) -> DataFrame:
    """Merge the per-singer columns of duets/trios into generic voice columns.

    Rows whose ``VOICES`` value is missing are discarded. For every remaining
    row that lists several different comma-separated voices, each column that
    belongs to the first listed voice is matched with the equivalent column of
    the other voices, and the matched values are combined into a column named
    with the prefix returned by ``get_sound_prefix("Voice")``:

    * all values are strings -> the first value is kept;
    * all values are missing -> the matched source columns are dropped;
    * ``HIGHEST_NOTE_INDEX`` columns, or columns containing both
      ``"Largest"`` and ``"Asc"`` -> maximum;
    * ``LOWEST_NOTE_INDEX`` columns, or columns containing both
      ``"Largest"`` and ``"Desc"`` -> minimum;
    * anything else -> mean.

    Parameters
    ----------
    df : DataFrame
        Frame with a ``VOICES`` column, a ``FILE_NAME`` column and the
        per-voice feature columns.

    Returns
    -------
    DataFrame
        A filtered copy of ``df`` carrying the merged columns. The frame
        passed in is not modified.
    """
    generic_sound_voice_prefix = get_sound_prefix("Voice")
    # Explicit copy: the cell assignments below must target an independent
    # frame, not a filtered selection of the caller's object.
    df = df[df[VOICES].notna()].copy()
    multiple_voices = df[df[VOICES].str.contains(",")]
    multiple_voices = _remove_repeated_voices(multiple_voices)
    pinfo(
        f"{multiple_voices.shape[0]} arias were found with duetos/trietos. Calculating averages."
    )
    voice_cols = [
        col
        for col in df.columns.values
        if any(voice in col for voice in voices_list_prefixes)
    ]
    for index in tqdm(multiple_voices.index):
        name = df.at[index, FILE_NAME]
        all_voices = df.at[index, VOICES].split(",")
        if all(x == all_voices[0] for x in all_voices):
            continue
        pinfo(f"\nMerging dueto/trieto {name}")
        first_voice = all_voices[0]
        first_prefix = get_part_prefix(first_voice)
        columns_to_merge = [i for i in voice_cols if first_voice in i.lower()]
        for col in columns_to_merge:
            formatted_col = col.replace(first_prefix, generic_sound_voice_prefix)
            similar_cols = []
            for voice in all_voices:
                similar_col = col.replace(first_prefix, get_part_prefix(voice))
                if similar_col in df:
                    similar_cols.append(similar_col)
            if not similar_cols:
                # ``voice_cols`` was computed up front, so a column dropped
                # while handling an earlier row can still be listed here.
                continue
            values = df.loc[index, similar_cols]
            if all(isinstance(x, str) for x in values):
                # Positional access; ``values[0]`` would be a label lookup.
                df.at[index, formatted_col] = values.iloc[0]
            elif values.isna().all():
                # ``isna`` also copes with None/str entries, where np.isnan
                # raises TypeError.
                # NOTE(review): this removes the columns for every row, not
                # only for the current one - confirm that is intended.
                df.drop(similar_cols, inplace=True, axis=1)
            elif HIGHEST_NOTE_INDEX in col or ("Largest" in col and "Asc" in col):
                # Was ``("Largest" and "Asc") in col``, i.e. just "Asc" in col.
                df.at[index, formatted_col] = values.max()
            elif LOWEST_NOTE_INDEX in col or ("Largest" in col and "Desc" in col):
                # Was ``("Largest" and "Desc") in col``, i.e. just "Desc" in col.
                df.at[index, formatted_col] = values.min()
            else:
                df.at[index, formatted_col] = values.mean()
    return df
def _remove_repeated_voices(multiple_voices):
    """Drop the rows whose ``VOICES`` entry repeats one single voice.

    A row is dropped when every comma-separated item of its ``VOICES`` value
    equals the first item. The remaining rows are returned as a new selection
    of ``multiple_voices``.
    """
    single_voice_rows = [
        label
        for label, voices in multiple_voices[VOICES].items()
        # split() never returns an empty list, so one distinct item means
        # "all items equal the first one".
        if len(set(voices.split(","))) == 1
    ]
    keep_mask = ~multiple_voices.index.isin(single_voice_rows)
    return multiple_voices[keep_mask]
def merge_single_voices(df: DataFrame) -> None:
    """Join the columns of every singer into generic voice columns, in place.

    Columns whose name contains one of ``voices_list_prefixes`` are grouped by
    the text that follows their first ``"_"``. Each group is summed row-wise
    into a column whose leading ``<singer>_`` segment is replaced with the
    prefix returned by ``get_sound_prefix("Voice")``. Groups made only of
    object columns are converted to strings first, and the text ``"nan"`` is
    removed from the joined result. In other groups missing values are
    replaced by 0 before summing. A target column that already exists is
    left untouched.
    """
    generic_sound_voice_prefix = get_sound_prefix("Voice")
    pinfo("\nJoining voice parts...")
    singer_columns = [
        i
        for i in df.columns.values
        if any(voice in i for voice in voices_list_prefixes)
    ]
    for col in singer_columns:
        singer_part, _, generic_col = col.partition("_")
        formatted_col = col.replace(singer_part + "_", generic_sound_voice_prefix)
        if formatted_col in df:
            continue
        columns_to_merge = [
            i for i in singer_columns if i.partition("_")[2] == generic_col
        ]
        if all(df[columns_to_merge].dtypes == object):
            df[columns_to_merge] = df[columns_to_merge].astype(str)
            df[formatted_col] = df[columns_to_merge].sum(axis=1)
            df[formatted_col] = [i.replace("nan", "") for i in df[formatted_col]]
        else:
            # Assign the filled columns back. The former
            # ``df[c].fillna(0, inplace=True)`` works on a temporary object
            # and no longer reaches ``df`` under pandas Copy-on-Write.
            df[columns_to_merge] = df[columns_to_merge].fillna(0)
            df[formatted_col] = df[columns_to_merge].sum(axis=1)
def _join_double_bass(df: DataFrame): df.drop([i for i in df.columns if "PartBsII" in i], axis=1, inplace=True) double_bass_columns = [i for i in df.columns if "PartBsI" in i] for col in double_bass_columns: formatted_col = col.replace("BsI_", "Bs_") df[formatted_col].fillna(0, inplace=True) if df[formatted_col].dtypes == object: df[formatted_col] = df[formatted_col].astype(str) df[col] = df[col].astype(str) df[formatted_col] = df[[col, formatted_col]].sum(axis=1) df[formatted_col] = [i.replace("nan", "") for i in df[formatted_col]] # df[formatted_col] = df[formatted_col].astype(float) else: df[col] = df[col].astype(float) df[formatted_col] = df[[formatted_col, col]].sum(axis=1) df.drop(double_bass_columns, axis=1, inplace=True) return df
def join_part_degrees(
    total_degrees: List[str], part: str, df: DataFrame, sufix: str = ""
) -> None:
    """Add per-part sums of degree columns grouped by accidental, in place.

    The columns of ``total_degrees`` whose name contains ``part`` are split
    into groups according to the accidental found in the name once the part
    name is removed: ``"#"``, a single ``"b"``, ``"bb"`` and ``"x"``. Columns
    matching ``^<part>Degree<digit>`` form the natural group and all the other
    part columns form the non-natural group. Each group is summed row-wise
    into ``part + DEGREE_PREFIX + <label> + sufix``, with labels ``_Asc``,
    ``_Desc``, ``_Dasc``, ``_Ddesc``, ``_Nat`` and ``_Nonat``.

    Parameters
    ----------
    total_degrees : List[str]
        Names of the degree columns to consider.
    part : str
        Part prefix used to select and to name columns.
    df : DataFrame
        Frame holding the columns; new columns are added to it.
    sufix : str
        Text appended to the name of every new column.
    """
    part_degrees = [i for i in total_degrees if part in i]

    def _without_part(column: str) -> str:
        # The accidental must be searched outside the part name: a part
        # prefix that itself contains "b" or "x" would otherwise put every
        # one of its columns in the flat / double-sharp groups.
        return column.replace(part, "", 1)

    aug = [i for i in part_degrees if "#" in _without_part(i)]
    desc = [
        i
        for i in part_degrees
        if "b" in _without_part(i) and "bb" not in _without_part(i)
    ]
    d_desc = [i for i in part_degrees if "bb" in _without_part(i)]
    d_asc = [i for i in part_degrees if "x" in _without_part(i)]

    # ``part`` is literal text, so it is escaped before entering the regex.
    pattern = "^" + re.escape(part) + "Degree" + "[0-9].*"
    degree_nat = [x for x in part_degrees if re.search(pattern, x)]
    degree_nonat = [i for i in part_degrees if i not in degree_nat]

    df[part + DEGREE_PREFIX + "_Asc" + sufix] = df[aug].sum(axis=1)
    df[part + DEGREE_PREFIX + "_Desc" + sufix] = df[desc].sum(axis=1)
    df[part + DEGREE_PREFIX + "_Dasc" + sufix] = df[d_asc].sum(axis=1)
    df[part + DEGREE_PREFIX + "_Ddesc" + sufix] = df[d_desc].sum(axis=1)
    df[part + DEGREE_PREFIX + "_Nat" + sufix] = df[degree_nat].sum(axis=1)
    df[part + DEGREE_PREFIX + "_Nonat" + sufix] = df[degree_nonat].sum(axis=1)
def log_errors_and_shape(
    composer_counter: list, novoices_counter: list, df: DataFrame
) -> None:
    """Log the skipped-file summaries and the final size of ``df``.

    Five messages are sent to ``pinfo`` in order: the number of entries in
    ``composer_counter``, the list itself, the number of entries in
    ``novoices_counter``, the list itself, and the row/column counts of
    ``df``.
    """

    def _messages():
        # Produced lazily so that each value is computed right before it is
        # logged.
        yield f"\nTotal files skipped by composer: {len(composer_counter)}"
        yield str(composer_counter)
        yield f"\nTotal files skipped by no-voices: {len(novoices_counter)}"
        yield str(novoices_counter)
        # pinfo(f"\nTotal files skipped by duetos/trietos: {len(duetos_counter)}")
        # pinfo(str(duetos_counter))
        yield (
            f"\nFinal shape of the DataFrame: {df.shape[0]} rows, {df.shape[1]} features"
        )

    for message in _messages():
        pinfo(message)
def delete_columns(data: DataFrame, config: dict) -> None:
    """Remove from ``data``, in place, the columns selected by ``config``.

    The following columns are dropped, in this order:

    1. for each entry of ``config[INSTRUMENTS_TO_DELETE]``, columns containing
       ``"Part" + entry`` or ``entry + "_"``;
    2. columns containing any entry of ``config[SUBSTRING_TO_DELETE]``;
    3. columns ending with any entry of ``config[ENDSWITH]``;
    4. columns starting with any entry of ``config[STARTSWITH]``;
    5. columns containing any entry of ``config["columns_contain"]``;
    6. the ``"Presence_of_<entry>"`` columns built from
       ``config["delete_presence"]`` that exist in ``data``;
    7. columns starting with ``"Sound"`` that do not contain ``"Voice"``.

    ``delete_exceptions`` is then applied to ``data``.

    Parameters
    ----------
    data : DataFrame
        Frame to clean; modified in place.
    config : dict
        Mapping providing the keys listed above.
    """
    pinfo("\nDeleting not useful columns...")

    for inst in config[INSTRUMENTS_TO_DELETE]:
        data.drop(
            [i for i in data.columns if "Part" + inst in i or inst + "_" in i],
            axis=1,
            inplace=True,
        )

    for substring in config[SUBSTRING_TO_DELETE]:
        data.drop([i for i in data.columns if substring in i], axis=1, inplace=True)

    endings = tuple(config[ENDSWITH])
    data.drop([i for i in data.columns if i.endswith(endings)], axis=1, inplace=True)

    beginnings = tuple(config[STARTSWITH])
    data.drop(
        [i for i in data.columns if i.startswith(beginnings)], axis=1, inplace=True
    )

    fragments = tuple(config["columns_contain"])
    data.drop(
        [col for col in data.columns if any(text in col for text in fragments)],
        axis=1,
        inplace=True,
    )

    presence = ["Presence_of_" + str(i) for i in config["delete_presence"]]
    # errors="ignore" already skips the names that are absent. The former
    # ``all(...)`` guard kept every presence column as soon as a single one
    # was missing.
    data.drop(presence, axis=1, inplace=True, errors="ignore")

    data.drop(
        [i for i in data.columns if i.startswith("Sound") and "Voice" not in i],
        axis=1,
        inplace=True,
    )

    delete_exceptions(data)
def delete_exceptions(data) -> None:
    """Drop a fixed set of special-case columns from ``data``, in place.

    Removed, when present:

    * columns whose name matches ``get_part_prefix("Vn")``;
    * the column ``"PartVnI__PartVoice__" + TEXTURE``;
    * ``FAMILY_INSTRUMENTATION`` and ``FAMILY_SCORING`` (only when both
      exist);
    * columns starting with one of ``voices_list_prefixes`` whose values are
      all null.
    """
    # Delete Vn when it is alone
    data.drop(
        data.columns[data.columns.str.contains(get_part_prefix("Vn"))],
        axis=1,
        inplace=True,
    )

    # Test and delete the same name; the deletion used to rely on a
    # hard-coded "...Texture" literal while the test used TEXTURE.
    texture_column = "PartVnI__PartVoice__" + TEXTURE
    if texture_column in data:
        del data[texture_column]

    # ``(A and B) in data`` only checked B, so a frame lacking A raised
    # KeyError on the drop below.
    if FAMILY_INSTRUMENTATION in data and FAMILY_SCORING in data:
        data.drop([FAMILY_INSTRUMENTATION, FAMILY_SCORING], axis=1, inplace=True)

    # Remove empty voices
    voice_prefixes = tuple(voices_list_prefixes)
    empty_voices = [
        col
        for col in data.columns
        if col.startswith(voice_prefixes) and all(data[col].isnull().values)
    ]
    if empty_voices:
        data.drop(empty_voices, axis=1, inplace=True)
def split_passion_A(data: DataFrame) -> None:
    """Split ``Label_PassionA`` into primary and secondary label columns.

    The ``Label_PassionA`` values are split on ``";"``. The first item is
    stored in ``Label_PassionA_primary`` and the second one in
    ``Label_PassionA_secundary``. Rows without a second item reuse the
    primary label as secondary. ``Label_PassionA`` is removed afterwards.

    Parameters
    ----------
    data : DataFrame
        Frame holding a ``Label_PassionA`` column; modified in place.
    """
    source = data["Label_PassionA"]
    # Split once. With expand=True the result only has as many columns as
    # the longest split, so column 1 is absent when no value contains ";".
    parts = source.str.split(";", expand=True)

    primary = parts[0] if 0 in parts.columns else source
    if 1 in parts.columns:
        secondary = parts[1].fillna(primary)
    else:
        secondary = primary

    data["Label_PassionA_primary"] = primary
    data["Label_PassionA_secundary"] = secondary
    data.drop("Label_PassionA", axis=1, inplace=True)
def join_keys(df: DataFrame) -> None:
    """Add grouped key-percentage columns to ``df``, in place.

    Column names are built as ``KEY_PREFIX + <label> + KEY_PERCENTAGE``. The
    existing columns for the labels below are summed row-wise into new
    columns:

    * ``SD``: ``IV``, ``II``, ``VI``
    * ``sd``: ``iv``, ``ii``
    * ``SubD``: ``sd`` plus ``SD``
    * ``T``: ``I``, ``i``
    * ``rel``: ``III``, ``vi``
    * ``Other``: every other column containing ``KEY_PREFIX`` but not
      ``KEY_MODULATORY``
    """

    def percentage_column(label):
        return KEY_PREFIX + label + KEY_PERCENTAGE

    def existing(labels):
        candidates = (percentage_column(label) for label in labels)
        return [name for name in candidates if name in df]

    upper_group = existing(("IV", "II", "VI"))
    lower_group = existing(("iv", "ii"))
    tonic_group = existing(("I", "i"))
    relative_group = existing(("III", "vi"))

    # Collected before any new column is created, so that the aggregates
    # added below are not counted as "other" keys.
    already_grouped = set(relative_group + tonic_group + lower_group + upper_group)
    remaining = [
        name
        for name in df.columns
        if KEY_PREFIX in name
        and name not in already_grouped
        and KEY_MODULATORY not in name
    ]

    upper_total = percentage_column("SD")
    lower_total = percentage_column("sd")
    df[upper_total] = df[upper_group].sum(axis=1)
    df[lower_total] = df[lower_group].sum(axis=1)
    df[percentage_column("SubD")] = df[lower_total] + df[upper_total]
    df[percentage_column("T")] = df[tonic_group].sum(axis=1)
    df[percentage_column("rel")] = df[relative_group].sum(axis=1)
    df[percentage_column("Other")] = df[remaining].sum(axis=1)
# df.drop(total_key + others_key, axis = 1, inplace=True)
def join_keys_modulatory(df: DataFrame):
    """Add grouped modulatory-key columns to ``df``, in place.

    Column names are built as ``KEY_PREFIX + KEY_MODULATORY + <label>``. The
    existing columns for the labels below are summed row-wise into new
    columns:

    * ``SD``: ``IV``, ``II``, ``VI``
    * ``sd``: ``iv``, ``ii``
    * ``SubD``: ``sd`` plus ``SD``
    * ``T``: ``I``, ``i``
    * ``rel``: ``III``, ``vi``
    * ``Other``: every other column containing
      ``KEY_PREFIX + KEY_MODULATORY``
    """
    base = KEY_PREFIX + KEY_MODULATORY

    def existing(labels):
        candidates = (base + label for label in labels)
        return [name for name in candidates if name in df]

    upper_group = existing(("IV", "II", "VI"))
    lower_group = existing(("iv", "ii"))
    tonic_group = existing(("I", "i"))
    relative_group = existing(("III", "vi"))

    # Collected before any new column is created, so that the aggregates
    # added below are not counted as "other" keys.
    already_grouped = set(relative_group + tonic_group + lower_group + upper_group)
    remaining = [
        name
        for name in df.columns
        if base in name and name not in already_grouped
    ]

    upper_total = base + "SD"
    lower_total = base + "sd"
    df[upper_total] = df[upper_group].sum(axis=1)
    df[lower_total] = df[lower_group].sum(axis=1)
    df[base + "SubD"] = df[lower_total] + df[upper_total]
    df[base + "T"] = df[tonic_group].sum(axis=1)
    df[base + "rel"] = df[relative_group].sum(axis=1)
    df[base + "Other"] = df[remaining].sum(axis=1)
def merge_dataframes(name: str, dest_path: str) -> None:
    """Concatenate ``<name>_1.csv`` and ``<name>_2.csv`` into one CSV file.

    Both files are read with pandas, stacked row-wise in that order, and
    written to ``dest_path`` without the index column.
    """
    frames = [pd.read_csv(f"{name}_{number}.csv") for number in (1, 2)]
    stacked = pd.concat(frames, axis=0)
    stacked.to_csv(dest_path, index=False)