Skip to main content

Glossary analysis and export

Use Lakehouse to analyze business glossaries, including terms, categories, documentation, and how terms are assigned across data assets. These queries help you export glossary metadata for documentation, compliance reporting, and stewardship review.

Before you begin

Before you run these queries, make sure:

You can run these queries on the GLOSSARY_DETAILS, README, and ASSETS tables in your Lakehouse to analyze glossary structure and usage across your data platform.

Retrieve glossary terms

Use this query to retrieve glossary metadata that includes parent glossary information, categories, terms. It's optimized for glossary documentation and governance reporting where you need a comprehensive view of terms, their organization, and basic usage statistics.

info
  • Update the WHERE clauses in the CTEs below to filter glossary terms that match your use case.
-- ==============================================================================
-- PURPOSE: Retrieve comprehensive glossary term metadata with parent glossary,
-- associated categories, readme description, and assigned entity details
-- ==============================================================================

-- CTE 1: Extract all active glossary terms with their metadata
WITH glossary_terms AS (
SELECT
-- Core term identifiers
t.GUID AS term_guid,
t.NAME AS term_name,
t.QUALIFIED_NAME AS term_qualified_name,
t.DESCRIPTION AS term_description,

-- Status and certification
t.STATUS AS term_status,
t.CERTIFICATE_STATUS AS term_certificate_status,

-- Ownership
t.OWNER_USERS AS term_owner_users,

-- Audit timestamps (convert from epoch milliseconds to timestamp)
TO_TIMESTAMP_LTZ(t.CREATED_TIME / 1000) AS term_created_at,
TO_TIMESTAMP_LTZ(t.UPDATED_TIME / 1000) AS term_updated_at,
t.CREATED_BY AS term_created_by,
t.UPDATED_BY AS term_updated_by,

-- Relationship fields
t.ANCHOR_GUID AS glossary_guid, -- Parent glossary GUID (single value)
t.CATEGORIES AS term_categories, -- Array of category GUIDs (0 to many)
t.README_GUID AS term_readme_guid, -- Readme GUID
t.ASSIGNED_ENTITIES AS term_assigned_entities -- Entities linked to this term

FROM GLOSSARY_DETAILS t
WHERE t.ASSET_TYPE = 'AtlasGlossaryTerm' -- Filter to terms only
AND t.STATUS = 'ACTIVE' -- Exclude archived/deleted terms
),

-- CTE 2: Extract all active glossaries for joining to terms
glossary_info AS (
SELECT
g.GUID AS glossary_guid,
g.NAME AS glossary_name,
g.QUALIFIED_NAME AS glossary_qualified_name,
g.DESCRIPTION AS glossary_description,
g.CERTIFICATE_STATUS AS glossary_certificate_status,
g.OWNER_USERS AS glossary_owner_users
FROM GLOSSARY_DETAILS g
WHERE g.ASSET_TYPE = 'AtlasGlossary' -- Filter to glossaries only
AND g.STATUS = 'ACTIVE' -- Exclude archived/deleted glossaries
),

-- CTE 3: Extract all active categories for joining to terms
category_info AS (
SELECT
c.GUID AS category_guid,
c.NAME AS category_name,
c.QUALIFIED_NAME AS category_qualified_name,
c.DESCRIPTION AS category_description
FROM GLOSSARY_DETAILS c
WHERE c.ASSET_TYPE = 'AtlasGlossaryCategory' -- Filter to categories only
AND c.STATUS = 'ACTIVE' -- Exclude archived/deleted categories
),

-- CTE 4: Get readme descriptions
readme_info AS (
SELECT
r.GUID AS readme_guid,
r.ASSET_NAME AS readme_name,
r.DESCRIPTION AS readme_description
FROM README r
WHERE r.STATUS = 'ACTIVE'
),

-- CTE 5: Flatten the term_categories array to enable joining with category details
-- This creates one row per term-category relationship
term_categories_expanded AS (
SELECT
t.term_guid,
cat_flat.VALUE::STRING AS category_guid -- Extract each category GUID from array
FROM glossary_terms t
, TABLE(FLATTEN(input => t.term_categories)) AS cat_flat -- Snowflake FLATTEN function
WHERE t.term_categories IS NOT NULL -- Only process terms with categories
),

-- CTE 6: Flatten assigned_entities and get asset details
term_assigned_assets AS (
SELECT
t.term_guid,
asset_flat.VALUE::STRING AS asset_guid
FROM glossary_terms t
, TABLE(FLATTEN(input => t.term_assigned_entities)) AS asset_flat
WHERE t.term_assigned_entities IS NOT NULL
),

-- CTE 7: Get asset names and qualified names
asset_details AS (
SELECT
taa.term_guid,
a.GUID AS asset_guid,
a.ASSET_NAME,
a.ASSET_QUALIFIED_NAME,
a.ASSET_TYPE
FROM term_assigned_assets taa
INNER JOIN ASSETS a ON a.GUID = taa.asset_guid
WHERE a.STATUS = 'ACTIVE'
)

-- Final query: Join all pieces together and aggregate categories per term
SELECT
-- ====================
-- TERM DETAILS
-- ====================
t.term_guid,
t.term_name,
t.term_qualified_name,
t.term_description,
t.term_status,
t.term_certificate_status,
t.term_owner_users,
t.term_created_at,
t.term_updated_at,
t.term_created_by,
t.term_updated_by,

-- Calculate days since last update
DATEDIFF(day, t.term_updated_at, CURRENT_TIMESTAMP()) AS days_since_update,

-- ====================
-- GLOSSARY DETAILS (1:1 relationship - each term has exactly one glossary)
-- ====================
g.glossary_guid,
g.glossary_name,
g.glossary_qualified_name,
g.glossary_description,
g.glossary_certificate_status,
g.glossary_owner_users,

-- ====================
-- README DETAILS
-- ====================
t.term_readme_guid,
r.readme_description AS term_readme_description,

-- ====================
-- CATEGORY DETAILS (1:Many relationship - aggregated into arrays)
-- Returns NULL arrays for terms without categories
-- ====================
ARRAY_AGG(DISTINCT c.category_guid) AS category_guids,
ARRAY_AGG(DISTINCT c.category_name) AS category_names,
ARRAY_AGG(DISTINCT c.category_qualified_name) AS category_qualified_names,
COUNT(DISTINCT c.category_guid) AS category_count, -- Number of categories this term belongs to

-- ====================
-- ASSIGNED ASSET DETAILS (aggregated from ASSETS table)
-- ====================
ARRAY_AGG(DISTINCT ad.asset_guid) AS assigned_asset_guids,
ARRAY_AGG(DISTINCT ad.ASSET_NAME) AS assigned_asset_names,
ARRAY_AGG(DISTINCT ad.ASSET_QUALIFIED_NAME) AS assigned_asset_qualified_names,
ARRAY_AGG(DISTINCT ad.ASSET_TYPE) AS assigned_asset_types,
COUNT(DISTINCT ad.asset_guid) AS assigned_asset_count

FROM glossary_terms t

-- Join to glossary (LEFT JOIN to handle edge cases, though all terms should have a glossary)
LEFT JOIN glossary_info g
ON g.glossary_guid = t.glossary_guid

-- Join to readme information
LEFT JOIN readme_info r
ON r.readme_guid = t.term_readme_guid

-- Join to expanded categories (LEFT JOIN because terms may have zero categories)
LEFT JOIN term_categories_expanded tce
ON tce.term_guid = t.term_guid

-- Join to category details to get names and descriptions
LEFT JOIN category_info c
ON c.category_guid = tce.category_guid

-- Join to asset details
LEFT JOIN asset_details ad
ON ad.term_guid = t.term_guid

-- Group by all non-aggregated columns to consolidate multiple category rows per term
GROUP BY
t.term_guid,
t.term_name,
t.term_qualified_name,
t.term_description,
t.term_status,
t.term_certificate_status,
t.term_owner_users,
t.term_created_at,
t.term_updated_at,
t.term_created_by,
t.term_updated_by,
g.glossary_guid,
g.glossary_name,
g.glossary_qualified_name,
g.glossary_description,
g.glossary_certificate_status,
g.glossary_owner_users,
t.term_readme_guid,
r.readme_description

-- Sort results by glossary name, then term name for easy navigation
ORDER BY g.glossary_name, t.term_name;
Sample output
TERM_NAMETERM_DESCRIPTIONTERM_STATUSTERM_CERTIFICATE_STATUSGLOSSARY_NAMECATEGORY_NAMESASSIGNED_ASSET_COUNT
Active FanThis is a Test during the F5 DemoACTIVEDRAFTBusiness LinesMarketing0
Annual RevenueAnnual Revenue refers to the total income that a company generates from its business activities in a single fiscal yearACTIVEVERIFIEDBusiness LinesFinance12
Annual RevenuesACTIVEVERIFIEDBusiness LinesFinance0
Application Uptime And AvailabilityProvides a measure of the time an application is fully operational and accessible to usersACTIVEDRAFTBusiness LinesPerformance Metrics0
Areal DensityACTIVEDRAFTBusiness LinesStorage Drives1