hed-standard · VisLab · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
@@ -7,7 +7,7 @@
 - Google-style docstrings; use `Parameters:` not `Args:`
 - Line length: 120 characters (configured in `pyproject.toml`)
 - Markdown headers use sentence case: capitalize only the first word (and proper nouns/acronyms)
-- When creating work summaries, place them in `.status/` at the repository root
+- **Temporary files** (test scripts, debug files, notes): always create them inside the `.status/` directory, never directly in the repository root
 
 ## Project overview
 

diff --git a/hed/schema/hed_cache.py b/hed/schema/hed_cache.py
@@ -130,9 +130,11 @@ def get_hed_versions(local_hed_directory=None, library_name=None, check_prerelea
             all_hed_versions[found_library_name].append(version)
     for name, hed_versions in all_hed_versions.items():
         all_hed_versions[name] = _sort_version_list(hed_versions)
+    if library_name == "all":
+        return all_hed_versions
     if library_name in all_hed_versions:
         return all_hed_versions[library_name]
-    return all_hed_versions
+    return []
 
 
 def get_hed_version_path(xml_version, library_name=None, local_hed_directory=None) -> Union[str, None]:

diff --git a/hed/schema/hed_schema.py b/hed/schema/hed_schema.py
@@ -406,9 +406,17 @@ def save_as_dataframes(self, base_filename, save_merged=False):
         Raises:
             OSError: File cannot be saved for some reason.
         """
+        from hed.schema.schema_io import df_constants
+
         output_dfs = Schema2DF().process_schema(self, save_merged)
-        if hasattr(self, "extras") and self.extras:
+        if hasattr(self, "extras") and self.extras and not save_merged:
+            # Only update with original extras if not saving merged
+            # When saving merged, the serializer's merged extras should be preserved
             output_dfs.update(self.extras)
+            # Strip in_library column from extras - it's internal metadata, never serialized
+            for key, df in output_dfs.items():
+                if key in df_constants.DF_EXTRAS and df_constants.in_library in df.columns:
+                    df.drop(columns=[df_constants.in_library], inplace=True)
         df_util.save_dataframes(base_filename, output_dfs)
 
     def set_schema_prefix(self, schema_namespace):

diff --git a/hed/schema/schema_io/base2schema.py b/hed/schema/schema_io/base2schema.py
@@ -249,6 +249,9 @@ def fix_extras(self):
 
         for key, extra in self._schema.extras.items():
             self._schema.extras[key] = extra.rename(columns=df_constants.EXTRAS_CONVERSIONS)
+            # Always strip in_library column (internal metadata, never serialized)
+            if df_constants.in_library in self._schema.extras[key].columns:
+                self._schema.extras[key] = self._schema.extras[key].drop(columns=[df_constants.in_library])
             if key in df_constants.extras_column_dict:
                 self._schema.extras[key] = self.fix_extra(key)
 
@@ -268,6 +271,8 @@ def fix_extra(self, key):
         if col_to_add:
             df[col_to_add] = ""
         other_cols = sorted(set(df.columns) - set(priority_cols))
+        # Strip in_library column (internal metadata, never serialized); all other columns are kept
+        other_cols = [col for col in other_cols if col != df_constants.in_library]
         df = df[priority_cols + other_cols]
         df = df.sort_values(by=list(df.columns))
         return df
diff --git a/hed/schema/schema_io/df2schema.py b/hed/schema/schema_io/df2schema.py
@@ -92,6 +92,9 @@ def _parse_data(self):
             )
         extras = {key: self.input_data[key] for key in constants.DF_EXTRAS if key in self.input_data}
         for key, _item in extras.items():
+            # Add in_library column if this is a library schema
+            if self.library and not extras[key].empty and constants.in_library not in extras[key].columns:
+                extras[key][constants.in_library] = self.library
             self._schema.extras[key] = df_util.merge_extras_dataframes(extras[key], self._schema.extras.get(key, None))
 
     def _get_prologue_epilogue(self, file_data):

diff --git a/hed/schema/schema_io/df_util.py b/hed/schema/schema_io/df_util.py
@@ -61,15 +61,17 @@ def merge_dataframes(df1, df2, key):
 def merge_extras_dataframes(library_df, standard_df):
     """Merge library and standard extras DataFrames by combining and deduplicating.
 
-    The library extras should contain all entries (standard + library-specific).
-    This function combines both and removes exact duplicates.
+    Rows are matched on every library column except in_library. When the same entry appears
+    in both the library and standard schemas, only the library row (which carries in_library)
+    is kept, so the library each entry came from stays traceable.
 
     Parameters:
         library_df (pd.DataFrame): DataFrame from library schema extras section
         standard_df (pd.DataFrame): DataFrame from standard schema extras section
 
     Returns:
-        pd.DataFrame: Combined DataFrame with duplicates removed and sorted
+        pd.DataFrame: Combined DataFrame with duplicates removed and sorted, with in_library
+                      preserved for duplicate entries
     """
     if standard_df is None or standard_df.empty:
         if library_df is None or library_df.empty:
@@ -78,10 +80,36 @@ def merge_extras_dataframes(library_df, standard_df):
     if library_df is None or library_df.empty:
         return standard_df.drop_duplicates().sort_values(by=list(standard_df.columns)).reset_index(drop=True)
 
-    combined = pd.concat([standard_df, library_df], ignore_index=True)
-    combined = combined.drop_duplicates()
-    combined = combined.sort_values(by=list(combined.columns)).reset_index(drop=True)
-    return combined
+    # Columns to compare for deduplication (exclude in_library)
+    compare_cols = [col for col in library_df.columns if col != constants.in_library]
+    if not compare_cols:
+        # If only in_library column, just return library_df
+        return library_df.drop_duplicates().sort_values(by=list(library_df.columns)).reset_index(drop=True)
+
+    # For each row in standard_df, check if it appears in library_df (by content, not by in_library)
+    # If it does, use the library version (which has in_library). If not, keep the standard version.
+    # Start with library_df
+    merged = library_df.copy()
+
+    # Deduplicate on compare_cols to avoid many-to-many join issues.
+    # When there are duplicate keys, merge() can produce more rows than either input,
+    # causing row indices to misalign. By deduplicating first on the key columns,
+    # we get unique combinations, find which are only in standard_df, then join back
+    # to standard_df to get the full rows.
+    standard_dedup = standard_df[compare_cols].drop_duplicates().reset_index(drop=True)
+    library_dedup = library_df[compare_cols].drop_duplicates().reset_index(drop=True)
+
+    # Use merge with indicator to find unique combinations only in standard_df
+    merge_result = standard_dedup.merge(library_dedup, on=compare_cols, how="left", indicator=True)
+    non_matching_dedup = merge_result[merge_result["_merge"] == "left_only"][compare_cols]
+
+    if len(non_matching_dedup) > 0:
+        # Join back to standard_df to get the full rows matching those unique combinations
+        to_add = standard_df.merge(non_matching_dedup, on=compare_cols, how="inner")
+        merged = pd.concat([merged, to_add], ignore_index=True)
+
+    merged = merged.drop_duplicates().sort_values(by=compare_cols).reset_index(drop=True)
+    return merged
 
 
 def merge_dataframe_dicts(df_dict1, df_dict2, key_column=constants.KEY_COLUMN_NAME):
@@ -110,15 +138,21 @@ def merge_dataframe_dicts(df_dict1, df_dict2, key_column=constants.KEY_COLUMN_NA
 
 
 def _merge_dataframes(df1, df2, key_column):
-    # Add columns from df2 that are not in df1, only for rows that are in df1
+    """Merge two DataFrames by adding columns from df2 to df1 based on key column matching.
 
+    Keeps all rows from df1 and adds columns from df2 where keys match.
+    This is used for enriching schema attribute DataFrames with additional column data.
+    """
     if df1.empty or df2.empty or key_column not in df1.columns or key_column not in df2.columns:
         raise HedFileError(
             HedExceptions.BAD_COLUMN_NAMES,
-            f"Both dataframes to be merged must be non-empty had nave a '{key_column}' column",
+            f"Both dataframes to be merged must be non-empty and have a '{key_column}' column",
             "",
         )
+
     df1 = df1.copy()
+
+    # Add columns from df2 that are not in df1
     for col in df2.columns:
         if col not in df1.columns and col != key_column:
             df1 = df1.merge(df2[[key_column, col]], on=key_column, how="left")

diff --git a/hed/schema/schema_io/json2schema.py b/hed/schema/schema_io/json2schema.py
@@ -512,6 +512,10 @@ def _load_extras(self):
         else:
             library_df = pd.DataFrame([], columns=df_constants.source_columns)
 
+        # Add in_library column if this is a library schema
+        if self.library and not library_df.empty:
+            library_df[df_constants.in_library] = self.library
+
         # Merge with existing schema extras if present (from withStandard base schema)
         standard_df = self._schema.extras.get(df_constants.SOURCES_KEY, None)
         self._schema.extras[df_constants.SOURCES_KEY] = df_util.merge_extras_dataframes(library_df, standard_df)
@@ -534,6 +538,10 @@ def _load_extras(self):
         else:
             library_df = pd.DataFrame([], columns=df_constants.prefix_columns)
 
+        # Add in_library column if this is a library schema
+        if self.library and not library_df.empty:
+            library_df[df_constants.in_library] = self.library
+
         # Merge with existing schema extras if present (from withStandard base schema)
         standard_df = self._schema.extras.get(df_constants.PREFIXES_KEY, None)
         self._schema.extras[df_constants.PREFIXES_KEY] = df_util.merge_extras_dataframes(library_df, standard_df)
@@ -557,6 +565,10 @@ def _load_extras(self):
         else:
             library_df = pd.DataFrame([], columns=df_constants.external_annotation_columns)
 
+        # Add in_library column if this is a library schema
+        if self.library and not library_df.empty:
+            library_df[df_constants.in_library] = self.library
+
         # Merge with existing schema extras if present (from withStandard base schema)
         standard_df = self._schema.extras.get(df_constants.EXTERNAL_ANNOTATION_KEY, None)
         self._schema.extras[df_constants.EXTERNAL_ANNOTATION_KEY] = df_util.merge_extras_dataframes(

diff --git a/hed/schema/schema_io/schema2base.py b/hed/schema/schema_io/schema2base.py
@@ -2,6 +2,8 @@
 
 from hed.schema.hed_schema_constants import HedSectionKey, HedKey
 from hed.errors.exceptions import HedFileError, HedExceptions
+from hed.schema.schema_io import df_util
+from hed.schema.schema_io import df_constants
 
 
 class Schema2Base:
@@ -45,6 +47,7 @@ def __init__(self):
         self._save_merged = False
         self._strip_out_in_library = False
         self._schema = None
+        self._base_schema = None  # Used when saving merged library schema to include base schema extras
 
     def process_schema(self, hed_schema, save_merged=False):
         """Convert a HedSchema object to the subclass's output format (mediawiki, XML, JSON, or TSV).
@@ -83,11 +86,22 @@ def process_schema(self, hed_schema, save_merged=False):
         self._save_base = False
         self._strip_out_in_library = True
         self._schema = hed_schema  # This is needed to save attributes in dataframes for now
+        self._base_schema = None  # Reset base schema reference
+
+        # If it is a library schema with a standard schema, we need to determine if we are saving merged or not.
         if hed_schema.with_standard:
             self._save_lib = True
             if save_merged:
                 self._save_base = True
                 self._strip_out_in_library = False
+                # Load base schema so extras can be merged when outputting.
+                # We intentionally do NOT wrap this in try/except. If the base schema cannot be loaded,
+                # we fail fast rather than silently producing an incomplete "merged" output
+                # (e.g., missing Sources/Prefixes/External annotations). This design prevents
+                # the subtle bug of producing partial output that looks valid but is missing data.
+                from hed.schema.hed_schema_io import load_schema_version
+
+                self._base_schema = load_schema_version(hed_schema.with_standard)
         else:
             # Saving a standard schema or a library schema without a standard schema
             save_merged = True
@@ -239,6 +253,37 @@ def _should_skip(self, entry):
     def _attribute_disallowed(self, attribute):
         return self._strip_out_in_library and attribute == HedKey.InLibrary
 
+    def _get_merged_extras(self, extras_key):
+        """Return the extras for one section, folding in base schema extras when saving merged.
+
+        Parameters:
+            extras_key (str): The key for the extras type (e.g., df_constants.SOURCES_KEY)
+
+        Returns:
+            pd.DataFrame or None: Library extras (possibly None) when not merging; else the merged frame, None if empty
+        """
+        lib_extras = self._schema.get_extras(extras_key)
+
+        # Compare against None explicitly so merging never depends on the schema object's truthiness
+        if not self._save_merged or self._base_schema is None:
+            return lib_extras
+
+        # Merge base schema extras with library extras
+        base_extras = self._base_schema.get_extras(extras_key)
+        if base_extras is None and lib_extras is None:
+            return None
+
+        # Give both frames the same columns before merging (in_library is the one that may differ)
+        if lib_extras is not None and not lib_extras.empty and df_constants.in_library in lib_extras.columns:
+            # Library extras have in_library column - ensure base extras has it too
+            if base_extras is not None and not base_extras.empty and df_constants.in_library not in base_extras.columns:
+                base_extras = base_extras.copy()  # add the column to a copy, not to the frame get_extras returned
+                base_extras[df_constants.in_library] = ""  # Mark base schema rows with empty string
+
+        # Combine and deduplicate via the shared helper so every serializer sees the same merged extras
+        merged = df_util.merge_extras_dataframes(lib_extras, base_extras)
+        return merged if not merged.empty else None
+
     def _format_tag_attributes(self, attributes):
         """Takes a dictionary of tag attributes and returns a string with the .mediawiki representation.
 

diff --git a/hed/schema/schema_io/schema2df.py b/hed/schema/schema_io/schema2df.py
@@ -90,8 +90,36 @@ def _output_extras(self, hed_schema):
             hed_schema(HedSchema): The HED schema to extract the information from
 
         """
+        # Import here to avoid circular imports
+        from hed.schema.schema_io import df_constants
+
+        # Get all extras keys that might exist
+        extras_keys = [df_constants.SOURCES_KEY, df_constants.PREFIXES_KEY, df_constants.EXTERNAL_ANNOTATION_KEY]
+
+        for key in extras_keys:
+            merged_extras = self._get_merged_extras(key)
+            if merged_extras is not None and not merged_extras.empty:
+                output_df = merged_extras.copy()
+                # Always strip in_library column - it's internal metadata, never serialized
+                if df_constants.in_library in output_df.columns:
+                    output_df = output_df.drop(columns=[df_constants.in_library])
+                self.output[key] = output_df
+            elif key in hed_schema.extras and hed_schema.extras[key] is not None:
+                # Include empty dataframes with proper structure
+                output_df = hed_schema.extras[key].copy()
+                # Always strip in_library column - it's internal metadata, never serialized
+                if df_constants.in_library in output_df.columns:
+                    output_df = output_df.drop(columns=[df_constants.in_library])
+                self.output[key] = output_df
+
+        # Also include any other extras that might exist
         for key, df in hed_schema.extras.items():
-            self.output[key] = df.copy()
+            if key not in self.output:
+                output_df = df.copy()
+                # Always strip in_library column - it's internal metadata, never serialized
+                if df_constants.in_library in output_df.columns:
+                    output_df = output_df.drop(columns=[df_constants.in_library])
+                self.output[key] = output_df
 
     def _output_epilogue(self, epilogue):
         base_object = "HedEpilogue"

diff --git a/hed/schema/schema_io/schema2json.py b/hed/schema/schema_io/schema2json.py
@@ -59,7 +59,7 @@ def _output_sources(self, hed_schema):
         Parameters:
             hed_schema (HedSchema): The schema being output
         """
-        sources = hed_schema.get_extras(df_constants.SOURCES_KEY)
+        sources = self._get_merged_extras(df_constants.SOURCES_KEY)
         if sources is None or sources.empty:
             return
 
@@ -80,7 +80,7 @@ def _output_prefixes(self, hed_schema):
         Parameters:
             hed_schema (HedSchema): The schema being output
         """
-        prefixes = hed_schema.get_extras(df_constants.PREFIXES_KEY)
+        prefixes = self._get_merged_extras(df_constants.PREFIXES_KEY)
         if prefixes is None or prefixes.empty:
             return
 
@@ -101,7 +101,7 @@ def _output_external_annotations(self, hed_schema):
         Parameters:
             hed_schema (HedSchema): The schema being output
         """
-        externals = hed_schema.get_extras(df_constants.EXTERNAL_ANNOTATION_KEY)
+        externals = self._get_merged_extras(df_constants.EXTERNAL_ANNOTATION_KEY)
         if externals is None or externals.empty:
             return
 

diff --git a/hed/schema/schema_io/schema2wiki.py b/hed/schema/schema_io/schema2wiki.py
@@ -61,7 +61,7 @@ def _output_extra(self, hed_schema, section_key, wiki_key):
             wiki_key (string): The key in the wiki constants for the section.
 
         """
-        extra = hed_schema.get_extras(section_key)
+        extra = self._get_merged_extras(section_key)
         if extra is None or extra.empty:
             return
 
@@ -73,6 +73,9 @@ def _output_extra(self, hed_schema, section_key, wiki_key):
             # Build column string from all columns
             column_strings = []
             for col in extra.columns:
+                # Always skip in_library column - it's internal metadata, never serialized
+                if col == df_constants.in_library:
+                    continue
                 if pd.notna(row[col]) and row[col] != "":
                     column_strings.append(f"{col}={row[col]}")
             self.current_tag_extra = ",".join(column_strings)

diff --git a/hed/schema/schema_io/schema2xml.py b/hed/schema/schema_io/schema2xml.py
@@ -45,7 +45,7 @@ def _output_extras(self, hed_schema):
         self._output_external_annotations(hed_schema)
 
     def _output_sources(self, hed_schema):
-        sources = hed_schema.get_extras(df_constants.SOURCES_KEY)
+        sources = self._get_merged_extras(df_constants.SOURCES_KEY)
         if sources is None or sources.empty:
             return
 
@@ -60,7 +60,7 @@ def _output_sources(self, hed_schema):
             description.text = row[df_constants.description]
 
     def _output_prefixes(self, hed_schema):
-        prefixes = hed_schema.get_extras(df_constants.PREFIXES_KEY)
+        prefixes = self._get_merged_extras(df_constants.PREFIXES_KEY)
         if prefixes is None or prefixes.empty:
             return
 
@@ -75,7 +75,7 @@ def _output_prefixes(self, hed_schema):
             prefix_description.text = row[df_constants.description]
 
     def _output_external_annotations(self, hed_schema):
-        externals = hed_schema.get_extras(df_constants.EXTERNAL_ANNOTATION_KEY)
+        externals = self._get_merged_extras(df_constants.EXTERNAL_ANNOTATION_KEY)
         if externals is None or externals.empty:
             return
 

diff --git a/hed/schema/schema_io/wiki2schema.py b/hed/schema/schema_io/wiki2schema.py
@@ -137,6 +137,10 @@ def _parse_extras(self, wiki_lines_by_section):
             stripped_key = extra_key.strip("'")
             stripped_key = WIKI_EXTRA_DICT.get(stripped_key, stripped_key)
 
+            # Add in_library column if this is a library schema
+            if self.library and not df.empty:
+                df[df_constants.in_library] = self.library
+
             # Merge with existing schema extras if present (from withStandard base schema)
             standard_df = self._schema.extras.get(stripped_key, None)
             self._schema.extras[stripped_key] = df_util.merge_extras_dataframes(df, standard_df)