Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/copilot-instructions.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
- Google-style docstrings; use `Parameters:` not `Args:`
- Line length: 120 characters (configured in `pyproject.toml`)
- Markdown headers use sentence case: capitalize only the first word (and proper nouns/acronyms)
- When creating work summaries, place them in `.status/` at the repository root
- **Temporary files** (test scripts, debug files, notes): Always create in `.status/` directory, never in the repository root

## Project overview

Expand Down
4 changes: 3 additions & 1 deletion hed/schema/hed_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,11 @@ def get_hed_versions(local_hed_directory=None, library_name=None, check_prerelea
all_hed_versions[found_library_name].append(version)
for name, hed_versions in all_hed_versions.items():
all_hed_versions[name] = _sort_version_list(hed_versions)
if library_name == "all":
return all_hed_versions
if library_name in all_hed_versions:
return all_hed_versions[library_name]
return all_hed_versions
return []


def get_hed_version_path(xml_version, library_name=None, local_hed_directory=None) -> Union[str, None]:
Expand Down
10 changes: 9 additions & 1 deletion hed/schema/hed_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,9 +406,17 @@ def save_as_dataframes(self, base_filename, save_merged=False):
Raises:
OSError: File cannot be saved for some reason.
"""
from hed.schema.schema_io import df_constants

output_dfs = Schema2DF().process_schema(self, save_merged)
if hasattr(self, "extras") and self.extras:
if hasattr(self, "extras") and self.extras and not save_merged:
# Only update with original extras if not saving merged
# When saving merged, the serializer's merged extras should be preserved
output_dfs.update(self.extras)
# Strip in_library column from extras - it's internal metadata, never serialized
for key, df in output_dfs.items():
if key in df_constants.DF_EXTRAS and df_constants.in_library in df.columns:
df.drop(columns=[df_constants.in_library], inplace=True)
df_util.save_dataframes(base_filename, output_dfs)

def set_schema_prefix(self, schema_namespace):
Expand Down
5 changes: 5 additions & 0 deletions hed/schema/schema_io/base2schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,9 @@ def fix_extras(self):

for key, extra in self._schema.extras.items():
self._schema.extras[key] = extra.rename(columns=df_constants.EXTRAS_CONVERSIONS)
# Always strip in_library column (internal metadata, never serialized)
if df_constants.in_library in self._schema.extras[key].columns:
self._schema.extras[key] = self._schema.extras[key].drop(columns=[df_constants.in_library])
if key in df_constants.extras_column_dict:
self._schema.extras[key] = self.fix_extra(key)

Expand All @@ -268,6 +271,8 @@ def fix_extra(self, key):
if col_to_add:
df[col_to_add] = ""
other_cols = sorted(set(df.columns) - set(priority_cols))
# Strip in_library column (internal metadata, never serialized); all other non-priority columns are kept, sorted by name
other_cols = [col for col in other_cols if col != df_constants.in_library]
df = df[priority_cols + other_cols]
df = df.sort_values(by=list(df.columns))
return df
3 changes: 3 additions & 0 deletions hed/schema/schema_io/df2schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ def _parse_data(self):
)
extras = {key: self.input_data[key] for key in constants.DF_EXTRAS if key in self.input_data}
for key, _item in extras.items():
# Add in_library column if this is a library schema
if self.library and not extras[key].empty and constants.in_library not in extras[key].columns:
extras[key][constants.in_library] = self.library
self._schema.extras[key] = df_util.merge_extras_dataframes(extras[key], self._schema.extras.get(key, None))

def _get_prologue_epilogue(self, file_data):
Expand Down
52 changes: 43 additions & 9 deletions hed/schema/schema_io/df_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,15 +61,17 @@ def merge_dataframes(df1, df2, key):
def merge_extras_dataframes(library_df, standard_df):
"""Merge library and standard extras DataFrames by combining and deduplicating.

The library extras should contain all entries (standard + library-specific).
This function combines both and removes exact duplicates.
When the same entry appears in both library and standard schemas, the library version
(with in_library attribute) is retained. This ensures we can track which library
specific entries came from.

Parameters:
library_df (pd.DataFrame): DataFrame from library schema extras section
standard_df (pd.DataFrame): DataFrame from standard schema extras section

Returns:
pd.DataFrame: Combined DataFrame with duplicates removed and sorted
pd.DataFrame: Combined DataFrame with duplicates removed and sorted, with in_library
preserved for duplicate entries
"""
if standard_df is None or standard_df.empty:
if library_df is None or library_df.empty:
Expand All @@ -78,10 +80,36 @@ def merge_extras_dataframes(library_df, standard_df):
if library_df is None or library_df.empty:
return standard_df.drop_duplicates().sort_values(by=list(standard_df.columns)).reset_index(drop=True)

combined = pd.concat([standard_df, library_df], ignore_index=True)
combined = combined.drop_duplicates()
combined = combined.sort_values(by=list(combined.columns)).reset_index(drop=True)
return combined
# Columns to compare for deduplication (exclude in_library)
compare_cols = [col for col in library_df.columns if col != constants.in_library]
if not compare_cols:
# If only in_library column, just return library_df
return library_df.drop_duplicates().sort_values(by=list(library_df.columns)).reset_index(drop=True)

# For each row in standard_df, check if it appears in library_df (by content, not by in_library)
# If it does, use the library version (which has in_library). If not, keep the standard version.
# Start with library_df
merged = library_df.copy()

# Deduplicate on compare_cols to avoid many-to-many join issues.
# When there are duplicate keys, merge() can produce more rows than either input,
# causing row indices to misalign. By deduplicating first on the key columns,
# we get unique combinations, find which are only in standard_df, then join back
# to standard_df to get the full rows.
standard_dedup = standard_df[compare_cols].drop_duplicates().reset_index(drop=True)
library_dedup = library_df[compare_cols].drop_duplicates().reset_index(drop=True)

# Use merge with indicator to find unique combinations only in standard_df
merge_result = standard_dedup.merge(library_dedup, on=compare_cols, how="left", indicator=True)
non_matching_dedup = merge_result[merge_result["_merge"] == "left_only"][compare_cols]

if len(non_matching_dedup) > 0:
# Join back to standard_df to get the full rows matching those unique combinations
to_add = standard_df.merge(non_matching_dedup, on=compare_cols, how="inner")
merged = pd.concat([merged, to_add], ignore_index=True)

merged = merged.drop_duplicates().sort_values(by=compare_cols).reset_index(drop=True)
return merged


def merge_dataframe_dicts(df_dict1, df_dict2, key_column=constants.KEY_COLUMN_NAME):
Expand Down Expand Up @@ -110,15 +138,21 @@ def merge_dataframe_dicts(df_dict1, df_dict2, key_column=constants.KEY_COLUMN_NA


def _merge_dataframes(df1, df2, key_column):
# Add columns from df2 that are not in df1, only for rows that are in df1
"""Merge two DataFrames by adding columns from df2 to df1 based on key column matching.

Keeps all rows from df1 and adds columns from df2 where keys match.
This is used for enriching schema attribute DataFrames with additional column data.
"""
if df1.empty or df2.empty or key_column not in df1.columns or key_column not in df2.columns:
raise HedFileError(
HedExceptions.BAD_COLUMN_NAMES,
f"Both dataframes to be merged must be non-empty had nave a '{key_column}' column",
f"Both dataframes to be merged must be non-empty and have a '{key_column}' column",
"",
)

df1 = df1.copy()

# Add columns from df2 that are not in df1
for col in df2.columns:
if col not in df1.columns and col != key_column:
df1 = df1.merge(df2[[key_column, col]], on=key_column, how="left")
Expand Down
12 changes: 12 additions & 0 deletions hed/schema/schema_io/json2schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,10 @@ def _load_extras(self):
else:
library_df = pd.DataFrame([], columns=df_constants.source_columns)

# Add in_library column if this is a library schema
if self.library and not library_df.empty:
library_df[df_constants.in_library] = self.library

# Merge with existing schema extras if present (from withStandard base schema)
standard_df = self._schema.extras.get(df_constants.SOURCES_KEY, None)
self._schema.extras[df_constants.SOURCES_KEY] = df_util.merge_extras_dataframes(library_df, standard_df)
Expand All @@ -534,6 +538,10 @@ def _load_extras(self):
else:
library_df = pd.DataFrame([], columns=df_constants.prefix_columns)

# Add in_library column if this is a library schema
if self.library and not library_df.empty:
library_df[df_constants.in_library] = self.library

# Merge with existing schema extras if present (from withStandard base schema)
standard_df = self._schema.extras.get(df_constants.PREFIXES_KEY, None)
self._schema.extras[df_constants.PREFIXES_KEY] = df_util.merge_extras_dataframes(library_df, standard_df)
Expand All @@ -557,6 +565,10 @@ def _load_extras(self):
else:
library_df = pd.DataFrame([], columns=df_constants.external_annotation_columns)

# Add in_library column if this is a library schema
if self.library and not library_df.empty:
library_df[df_constants.in_library] = self.library

# Merge with existing schema extras if present (from withStandard base schema)
standard_df = self._schema.extras.get(df_constants.EXTERNAL_ANNOTATION_KEY, None)
self._schema.extras[df_constants.EXTERNAL_ANNOTATION_KEY] = df_util.merge_extras_dataframes(
Expand Down
45 changes: 45 additions & 0 deletions hed/schema/schema_io/schema2base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from hed.schema.hed_schema_constants import HedSectionKey, HedKey
from hed.errors.exceptions import HedFileError, HedExceptions
from hed.schema.schema_io import df_util
from hed.schema.schema_io import df_constants


class Schema2Base:
Expand Down Expand Up @@ -45,6 +47,7 @@ def __init__(self):
self._save_merged = False
self._strip_out_in_library = False
self._schema = None
self._base_schema = None # Used when saving merged library schema to include base schema extras

def process_schema(self, hed_schema, save_merged=False):
"""Convert a HedSchema object to the subclass's output format (mediawiki, XML, JSON, or TSV).
Expand Down Expand Up @@ -83,11 +86,22 @@ def process_schema(self, hed_schema, save_merged=False):
self._save_base = False
self._strip_out_in_library = True
self._schema = hed_schema # This is needed to save attributes in dataframes for now
self._base_schema = None # Reset base schema reference

# If it is a library schema with a standard schema, we need to determine if we are saving merged or not.
if hed_schema.with_standard:
self._save_lib = True
if save_merged:
self._save_base = True
self._strip_out_in_library = False
# Load base schema so extras can be merged when outputting.
# We intentionally do NOT wrap this in try/except. If the base schema cannot be loaded,
# we fail fast rather than silently producing an incomplete "merged" output
# (e.g., missing Sources/Prefixes/External annotations). This design prevents
# the subtle bug of producing partial output that looks valid but is missing data.
from hed.schema.hed_schema_io import load_schema_version

self._base_schema = load_schema_version(hed_schema.with_standard)
else:
# Saving a standard schema or a library schema without a standard schema
save_merged = True
Expand Down Expand Up @@ -239,6 +253,37 @@ def _should_skip(self, entry):
def _attribute_disallowed(self, attribute):
    """Report whether an attribute should be left out of the output.

    Parameters:
        attribute: The attribute key being considered for output.

    Returns:
        bool: Truthy only when in-library stripping is switched on and the
            attribute is HedKey.InLibrary.
    """
    stripping = self._strip_out_in_library
    if not stripping:
        # Same result as a short-circuit `and`: hand back the falsy flag itself.
        return stripping
    return attribute == HedKey.InLibrary

def _get_merged_extras(self, extras_key):
    """Fetch one extras table, folding in base-schema rows when saving a merged library schema.

    Parameters:
        extras_key (str): The key for the extras type (e.g., df_constants.SOURCES_KEY)

    Returns:
        pd.DataFrame or None: The schema's own extras when not saving merged or when no base
            schema is held; otherwise the base and library extras combined, or None when both
            are missing or the combined table is empty.
    """
    own = self._schema.get_extras(extras_key)

    # Nothing to fold in unless this is a merged save with a base schema available.
    if not (self._save_merged and self._base_schema):
        return own

    base = self._base_schema.get_extras(extras_key)
    if own is None and base is None:
        return None

    # Keep the two tables column-compatible: when the library rows carry in_library
    # and the base rows do not, give the base rows the column too.
    if (
        own is not None
        and not own.empty
        and df_constants.in_library in own.columns
        and base is not None
        and not base.empty
        and df_constants.in_library not in base.columns
    ):
        base = base.copy()  # work on a copy so the base schema's own table is untouched
        base[df_constants.in_library] = ""  # Mark base schema rows with empty string

    combined = df_util.merge_extras_dataframes(own, base)
    if combined.empty:
        return None
    return combined

def _format_tag_attributes(self, attributes):
"""Takes a dictionary of tag attributes and returns a string with the .mediawiki representation.

Expand Down
30 changes: 29 additions & 1 deletion hed/schema/schema_io/schema2df.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,36 @@ def _output_extras(self, hed_schema):
hed_schema(HedSchema): The HED schema to extract the information from

"""
# Import here to avoid circular imports
from hed.schema.schema_io import df_constants

# Get all extras keys that might exist
extras_keys = [df_constants.SOURCES_KEY, df_constants.PREFIXES_KEY, df_constants.EXTERNAL_ANNOTATION_KEY]

for key in extras_keys:
merged_extras = self._get_merged_extras(key)
if merged_extras is not None and not merged_extras.empty:
output_df = merged_extras.copy()
# Always strip in_library column - it's internal metadata, never serialized
if df_constants.in_library in output_df.columns:
output_df = output_df.drop(columns=[df_constants.in_library])
self.output[key] = output_df
elif key in hed_schema.extras and hed_schema.extras[key] is not None:
# Include empty dataframes with proper structure
output_df = hed_schema.extras[key].copy()
# Always strip in_library column - it's internal metadata, never serialized
if df_constants.in_library in output_df.columns:
output_df = output_df.drop(columns=[df_constants.in_library])
self.output[key] = output_df

# Also include any other extras that might exist
for key, df in hed_schema.extras.items():
self.output[key] = df.copy()
if key not in self.output:
output_df = df.copy()
# Always strip in_library column - it's internal metadata, never serialized
if df_constants.in_library in output_df.columns:
output_df = output_df.drop(columns=[df_constants.in_library])
self.output[key] = output_df

def _output_epilogue(self, epilogue):
base_object = "HedEpilogue"
Expand Down
6 changes: 3 additions & 3 deletions hed/schema/schema_io/schema2json.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def _output_sources(self, hed_schema):
Parameters:
hed_schema (HedSchema): The schema being output
"""
sources = hed_schema.get_extras(df_constants.SOURCES_KEY)
sources = self._get_merged_extras(df_constants.SOURCES_KEY)
if sources is None or sources.empty:
return

Expand All @@ -80,7 +80,7 @@ def _output_prefixes(self, hed_schema):
Parameters:
hed_schema (HedSchema): The schema being output
"""
prefixes = hed_schema.get_extras(df_constants.PREFIXES_KEY)
prefixes = self._get_merged_extras(df_constants.PREFIXES_KEY)
if prefixes is None or prefixes.empty:
return

Expand All @@ -101,7 +101,7 @@ def _output_external_annotations(self, hed_schema):
Parameters:
hed_schema (HedSchema): The schema being output
"""
externals = hed_schema.get_extras(df_constants.EXTERNAL_ANNOTATION_KEY)
externals = self._get_merged_extras(df_constants.EXTERNAL_ANNOTATION_KEY)
if externals is None or externals.empty:
return

Expand Down
5 changes: 4 additions & 1 deletion hed/schema/schema_io/schema2wiki.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def _output_extra(self, hed_schema, section_key, wiki_key):
wiki_key (string): The key in the wiki constants for the section.

"""
extra = hed_schema.get_extras(section_key)
extra = self._get_merged_extras(section_key)
if extra is None or extra.empty:
return

Expand All @@ -73,6 +73,9 @@ def _output_extra(self, hed_schema, section_key, wiki_key):
# Build column string from all columns
column_strings = []
for col in extra.columns:
# Always skip in_library column - it's internal metadata, never serialized
if col == df_constants.in_library:
continue
if pd.notna(row[col]) and row[col] != "":
column_strings.append(f"{col}={row[col]}")
self.current_tag_extra = ",".join(column_strings)
Expand Down
6 changes: 3 additions & 3 deletions hed/schema/schema_io/schema2xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def _output_extras(self, hed_schema):
self._output_external_annotations(hed_schema)

def _output_sources(self, hed_schema):
sources = hed_schema.get_extras(df_constants.SOURCES_KEY)
sources = self._get_merged_extras(df_constants.SOURCES_KEY)
if sources is None or sources.empty:
return

Expand All @@ -60,7 +60,7 @@ def _output_sources(self, hed_schema):
description.text = row[df_constants.description]

def _output_prefixes(self, hed_schema):
prefixes = hed_schema.get_extras(df_constants.PREFIXES_KEY)
prefixes = self._get_merged_extras(df_constants.PREFIXES_KEY)
if prefixes is None or prefixes.empty:
return

Expand All @@ -75,7 +75,7 @@ def _output_prefixes(self, hed_schema):
prefix_description.text = row[df_constants.description]

def _output_external_annotations(self, hed_schema):
externals = hed_schema.get_extras(df_constants.EXTERNAL_ANNOTATION_KEY)
externals = self._get_merged_extras(df_constants.EXTERNAL_ANNOTATION_KEY)
if externals is None or externals.empty:
return

Expand Down
4 changes: 4 additions & 0 deletions hed/schema/schema_io/wiki2schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,10 @@ def _parse_extras(self, wiki_lines_by_section):
stripped_key = extra_key.strip("'")
stripped_key = WIKI_EXTRA_DICT.get(stripped_key, stripped_key)

# Add in_library column if this is a library schema
if self.library and not df.empty:
df[df_constants.in_library] = self.library

# Merge with existing schema extras if present (from withStandard base schema)
standard_df = self._schema.extras.get(stripped_key, None)
self._schema.extras[stripped_key] = df_util.merge_extras_dataframes(df, standard_df)
Expand Down
Loading
Loading