跳转至

Local environment(本地环境)

Python

Dataset row count

How can I calculate in bulk the number of rows in many datasets?

This code uses the Foundry API to trigger row count computation for a list of dataset RIDs. It sends a POST request to the Foundry Stats API with the dataset RID and branch as parameters.

from shutil import ExecError
from wsgiref import headers
import requests
from urllib3 import Retry
import json
import pprint

'''
Script will trigger row count computation on the set of provided dataset rids
'''

# Base variables
base_url = "https://STACK_NAME.palantircloud.com"
branch = "master"

DATASETS_RIDS = [
    "ri.foundry.main.dataset.6d2cd3de-0052-xxxxx-c7ae2c4ab1d8"
]

headers = {
    'Authorization': 'Bearer eyg_PUT_YOUR_TOKEN_HERE_xxxx',
    'Content-Type': 'application/json'
}

# Proxies
proxyDict = {
    # "https": "https://proxyIfNeeded:port"
}

# Retries
retry = Retry(connect=1, backoff_factor=0.5)
adapter = requests.adapters.HTTPAdapter(max_retries=retry)
http = requests.Session()
http.mount("https://", adapter)

def trigger_row_count(dataset_rid, branch):
    response = http.post(f'{base_url}/foundry-stats/api/stats/datasets/{dataset_rid}/branches/{branch}', headers=headers,
                         proxies=proxyDict)
    raw_response = response.text
    curr_response = json.loads(raw_response)
    pprint.pprint(curr_response)

    return curr_response

for curr_dataset_rid in DATASETS_RIDS :
    trigger_row_count(curr_dataset_rid, branch)
  • Date submitted: 2024-03-26
  • Tags: export, python, metrics, metadata, local

Get superset of columns across datasets

How can I get the set of all columns across multiple datasets?

This code uses the requests library to fetch the schema of each dataset in a list of target datasets, and then iterates through the fields in the schema to create a dictionary containing the frequency of each column in the superset of columns.

import time

from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import requests
import json
import pprint
import logging
import datetime
import collections


'''
Script that generates the superset of columns with their frequency from a set of datasets
'''

headers = {
    'Authorization': 'Bearer eyg_PUT_YOUR_TOKEN_HERE_xxxx',
    'Content-Type': 'application/json',
}

## STACK_NAME
base_url = "STACK_NAME.palantircloud.com"
branch = "master"


target_datasets = ["ri.foundry.main.dataset.4c2ac089-xxxx-4df863eaf823"]

# Proxies
proxyDict = {
    #"https": "https://proxyIfNeeded:port"
}

# Retries
retry = Retry(connect=1, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
http = requests.Session()
http.mount("https://", adapter)


global_list_fields = {}

for curr_dataset in target_datasets :
    # Get schema of the dataset
    print(f"Step 1. Get Schema of dataset")
    response = http.get(f'{base_url}/foundry-metadata/api/schemas/datasets/{curr_dataset}/branches/{branch}', headers=headers, proxies=proxyDict)
    print(f"Step 1. Response of getting schema of dataset")
    raw_response = response.text
    print(raw_response)
    curr_schema = json.loads(raw_response)
    list_fields = curr_schema["schema"]["fieldSchemaList"]

    for field in list_fields:
        curr_key = f"{field['name']} - {field['type']}"
        # Increment counter
        global_list_fields[curr_key] = global_list_fields.get(curr_key, 0) + 1

print("Unsorted dict")
pprint.pprint(global_list_fields)

# Sort it
sorted_dict = {k: v for k, v in sorted(global_list_fields.items(), key=lambda item: item[1])}
print("Sorted dict")
pprint.pprint(sorted_dict)
  • Date submitted: 2024-03-26
  • Tags: python, API, metadata, code repositories, code authoring, local

Get the path for a given resource RID

How can I find the path of a resource from its RID?

This code uses the requests library to send an HTTP GET request to the specified host with the given RID, and retrieves the path of the resource. It also handles retries and proxies.

from requests.adapters import HTTPAdapter
import requests
from urllib3 import Retry

'''
Script to return the path of a given Resource IDentifier (RID).
'''

# Headers
headers = {
    'Authorization': 'Bearer xxx', # Replace 'xxx' with your bearer token
    'Content-Type': 'application/json',
}

# Host
host = 'host.com:443'

# Proxies
proxyDict = {
    'https': 'http://proxy.domain.com:3333'
}

# Retries
retry = Retry(connect=1, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
http = requests.Session()
http.mount('https://', adapter)

# Enter the rid of the resource you want the path of
RESOURCE_RID = ''

# Throw an error if the reference has failed to be created
try:
    print(f'Fetching path for rid {RESOURCE_RID} ...')
    response = http.get(f'https://{host}/compass/api/resources/{RESOURCE_RID}/path-json', headers=headers, proxies=proxyDict)
    print('Completed request')
    print(f'The path is: {response.text}')
except requests.exceptions.RequestException as e:
    raise Exception(f"An error occurred in the request.\nReturning the path for the repository: {RESOURCE_RID} failed due to: {response.status_code} - {response.text}\nException: {e}")
  • Date submitted: 2024-03-26
  • Tags: api, python, metadata, local

中文翻译


本地环境

Python

数据集行数统计

如何批量计算多个数据集的行数?

以下代码使用 Foundry API 触发指定数据集 RID 列表的行数计算。它向 Foundry Stats API 发送 POST 请求,并将数据集 RID 和分支作为参数传入。

from shutil import ExecError
from wsgiref import headers
import requests
from urllib3 import Retry
import json
import pprint

'''
该脚本将触发对指定数据集 RID 集合的行数计算
'''

# 基础变量
base_url = "https://STACK_NAME.palantircloud.com"
branch = "master"

DATASETS_RIDS = [
    "ri.foundry.main.dataset.6d2cd3de-0052-xxxxx-c7ae2c4ab1d8"
]

headers = {
    'Authorization': 'Bearer eyg_PUT_YOUR_TOKEN_HERE_xxxx',
    'Content-Type': 'application/json'
}

# 代理设置
proxyDict = {
    # "https": "https://proxyIfNeeded:port"
}

# 重试机制
retry = Retry(connect=1, backoff_factor=0.5)
adapter = requests.adapters.HTTPAdapter(max_retries=retry)
http = requests.Session()
http.mount("https://", adapter)

def trigger_row_count(dataset_rid, branch):
    response = http.post(f'{base_url}/foundry-stats/api/stats/datasets/{dataset_rid}/branches/{branch}', headers=headers,
                         proxies=proxyDict)
    raw_response = response.text
    curr_response = json.loads(raw_response)
    pprint.pprint(curr_response)

    return curr_response

for curr_dataset_rid in DATASETS_RIDS :
    trigger_row_count(curr_dataset_rid, branch)
  • 提交日期:2024-03-26
  • 标签:exportpythonmetricsmetadatalocal

获取数据集的列超集

如何获取多个数据集的全部列集合?

以下代码使用 requests 库获取目标数据集列表中每个数据集的 schema(模式),然后遍历 schema 中的字段,创建包含超集中每个列出现频率的字典。

import time

from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import requests
import json
import pprint
import logging
import datetime
import collections


'''
该脚本从一组数据集中生成包含列频率的超集
'''

headers = {
    'Authorization': 'Bearer eyg_PUT_YOUR_TOKEN_HERE_xxxx',
    'Content-Type': 'application/json',
}

## STACK_NAME
base_url = "STACK_NAME.palantircloud.com"
branch = "master"


target_datasets = ["ri.foundry.main.dataset.4c2ac089-xxxx-4df863eaf823"]

# 代理设置
proxyDict = {
    #"https": "https://proxyIfNeeded:port"
}

# 重试机制
retry = Retry(connect=1, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
http = requests.Session()
http.mount("https://", adapter)


global_list_fields = {}

for curr_dataset in target_datasets :
    # 获取数据集的 schema
    print(f"步骤 1. 获取数据集 schema")
    response = http.get(f'{base_url}/foundry-metadata/api/schemas/datasets/{curr_dataset}/branches/{branch}', headers=headers, proxies=proxyDict)
    print(f"步骤 1. 获取数据集 schema 的响应")
    raw_response = response.text
    print(raw_response)
    curr_schema = json.loads(raw_response)
    list_fields = curr_schema["schema"]["fieldSchemaList"]

    for field in list_fields:
        curr_key = f"{field['name']} - {field['type']}"
        # 递增计数器
        global_list_fields[curr_key] = global_list_fields.get(curr_key, 0) + 1

print("未排序字典")
pprint.pprint(global_list_fields)

# 排序
sorted_dict = {k: v for k, v in sorted(global_list_fields.items(), key=lambda item: item[1])}
print("排序后字典")
pprint.pprint(sorted_dict)
  • 提交日期:2024-03-26
  • 标签:pythonAPImetadatacode repositoriescode authoringlocal

根据资源 RID 获取路径

如何通过资源 RID 查找其路径?

以下代码使用 requests 库向指定主机发送 HTTP GET 请求,传入给定的 RID,并获取资源的路径。同时包含重试和代理处理。

from requests.adapters import HTTPAdapter
import requests
from urllib3 import Retry

'''
该脚本用于返回指定资源标识符(RID)的路径。
'''

# 请求头
headers = {
    'Authorization': 'Bearer xxx', # 将 'xxx' 替换为您的 bearer token
    'Content-Type': 'application/json',
}

# 主机
host = 'host.com:443'

# 代理设置
proxyDict = {
    'https': 'http://proxy.domain.com:3333'
}

# 重试机制
retry = Retry(connect=1, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
http = requests.Session()
http.mount('https://', adapter)

# 输入您想要查询路径的资源 RID
RESOURCE_RID = ''

# 如果引用创建失败则抛出错误
try:
    print(f'正在获取 rid {RESOURCE_RID} 的路径...')
    response = http.get(f'https://{host}/compass/api/resources/{RESOURCE_RID}/path-json', headers=headers, proxies=proxyDict)
    print('请求完成')
    print(f'路径为: {response.text}')
except requests.exceptions.RequestException as e:
    raise Exception(f"请求过程中发生错误。\n返回仓库 {RESOURCE_RID} 的路径失败,原因:{response.status_code} - {response.text}\n异常信息:{e}")
  • 提交日期:2024-03-26
  • 标签:apipythonmetadatalocal