scientific-skills/dnanexus-integration/references/data-operations.md
DNAnexus provides comprehensive data management capabilities for files, records, databases, and other data objects. All data operations can be performed via the Python SDK (dxpy) or command-line interface (dx).
Files: Binary or text data stored on the platform.
Records: Structured data objects with arbitrary JSON details and metadata.
Databases: Structured database objects for relational data.
Applets and apps: Executable programs (covered in app-development.md).
Workflows: Multi-step analysis pipelines.
Open State: Data can be modified
Closed State: Data becomes immutable
Create (open) → Modify → Close (immutable)
Most objects start open and require explicit closure:
# Close a file; once closed, its data becomes immutable
file_obj.close()
Some objects can be created and closed in one operation:
# Create a record and close it as part of the same call
record = dxpy.new_dxrecord(
    details={...},
    close=True,
)
From local file:
import dxpy
# Upload a local file into the given project; the call returns a file handler
file_obj = dxpy.upload_local_file("data.txt", project="project-xxxx")
# get_id() yields the ID of the newly uploaded file
print(f"Uploaded: {file_obj.get_id()}")
With metadata:
# Upload with a custom name, destination folder, properties, and tags
file_obj = dxpy.upload_local_file(
    "data.txt",
    name="my_data",
    project="project-xxxx",
    folder="/results",
    properties={"sample": "sample1", "type": "raw"},
    tags=["experiment1", "batch2"],
)
Streaming upload:
# Write content piece by piece into a new platform file
# (useful for large files or data generated on the fly)
file_obj = dxpy.new_dxfile(project="project-xxxx", name="output.txt")
for line in ("Line 1\n", "Line 2\n"):
    file_obj.write(line)
file_obj.close()
To local file:
# Download directly from a file ID
dxpy.download_dxfile("file-xxxx", "local_output.txt")

# Download through an object handler by resolving its ID first
file_obj = dxpy.DXFile("file-xxxx")
file_id = file_obj.get_id()
dxpy.download_dxfile(file_id, "local_output.txt")
Read file contents:
# Open the platform file for reading and load its entire contents.
# dxpy.open_dxfile() returns a file handler that works as a context manager.
with dxpy.open_dxfile("file-xxxx") as f:
    contents = f.read()
Download to specific directory:
# The second argument is the full local destination path, file name included
dxpy.download_dxfile("file-xxxx", "/path/to/directory/filename.txt")
Get file information:
# Fetch the file's description from the platform
file_obj = dxpy.DXFile("file-xxxx")
describe = file_obj.describe()
# The description is indexed by field name; print a few of its fields
print(f"Name: {describe['name']}")
print(f"Size: {describe['size']} bytes")
print(f"State: {describe['state']}")
print(f"Created: {describe['created']}")
Update file metadata:
# Set key/value properties on the file
file_obj.set_properties({"experiment": "exp1", "version": "v2"})
# Attach tags to the file
file_obj.add_tags(["validated", "published"])
# Give the file a new name
file_obj.rename("new_name.txt")
Records store structured metadata with arbitrary JSON.
# Create a typed record carrying sample metadata, closed on creation
record = dxpy.new_dxrecord(
    name="sample_metadata",
    types=["SampleMetadata"],
    details={
        "sample_id": "S001",
        "tissue": "blood",
        "age": 45,
        "conditions": ["diabetes"],
    },
    project="project-xxxx",
    close=True,
)
# Look up an existing record and fetch its description
record = dxpy.DXRecord("record-xxxx")
describe = record.describe()

# Pull individual fields out of the record's details
details = record.get_details()
sample_id, tissue = details["sample_id"], details["tissue"]
# Details can only be changed while the record is still open
record = dxpy.DXRecord("record-xxxx")
details = record.get_details()
details.update({"processed": True})
record.set_details(details)
record.close()
Search by name:
# name_mode defaults to an exact match, so "glob" is needed for "*.fastq"
# to be treated as a wildcard pattern.
# describe=True makes every result carry the "describe" entry read below.
results = dxpy.find_data_objects(
    name="*.fastq",
    name_mode="glob",
    project="project-xxxx",
    folder="/raw_data",
    describe=True,
)
for result in results:
    print(f"{result['describe']['name']}: {result['id']}")
Search by properties:
# Find files in the project whose properties match the given key/value pairs
results = dxpy.find_data_objects(
    classname="file",
    properties={"sample": "sample1", "type": "processed"},
    project="project-xxxx",
)
Search by type:
# Restrict the search to records carrying the "SampleMetadata" type
results = dxpy.find_data_objects(
    classname="record",
    typename="SampleMetadata",
    project="project-xxxx",
)
Search with state filter:
# Limit results to files that are in the closed state
results = dxpy.find_data_objects(
    classname="file",
    state="closed",
    project="project-xxxx",
)
# No project is given, so the search spans all accessible projects
results = dxpy.find_data_objects(
    name="important_data.txt",
    describe=True,  # include full descriptions in each result
)
# Clone file to another project.
# clone() copies from the project bound to the handler, so name the source
# project explicitly instead of relying on the currently selected project.
new_file = dxpy.DXFile("file-xxxx", project="project-xxxx").clone(
    project="project-yyyy",
    folder="/imported_data",
)
# Clone every file found in /results of the source project into /backup
# of the destination project
files = dxpy.find_data_objects(
    classname="file",
    project="project-xxxx",
    folder="/results",
)
for file in files:
    # Bind the handler to the project the result came from: clone() uses the
    # handler's project as the source of the copy.
    file_obj = dxpy.DXFile(file["id"], project=file["project"])
    file_obj.clone(project="project-yyyy", folder="/backup")
# Create a project and keep its ID for subsequent calls
project = dxpy.api.project_new(
    {
        "name": "My Analysis Project",
        "description": "RNA-seq analysis for experiment X",
    }
)
project_id = project["id"]
# Grant a user access to the project at the chosen permission level
invitation = {
    "invitee": "user-xxxx",
    "level": "CONTRIBUTE",  # one of VIEW, UPLOAD, CONTRIBUTE, ADMINISTER
}
dxpy.api.project_invite(project_id, invitation)
# Print the name and ID of every accessible project
projects = dxpy.find_projects(describe=True)
for proj in projects:
    desc = proj['describe']
    print(f"{desc['name']}: {proj['id']}")
# Create a nested folder path in the project; "parents": True is passed so
# that missing intermediate folders are presumably created along the way
new_folder_request = {"folder": "/analysis/batch1/results", "parents": True}
dxpy.api.project_new_folder("project-xxxx", new_folder_request)
# Move file to a different folder; the handler is bound to the project that
# contains the file
file_obj = dxpy.DXFile("file-xxxx", project="project-xxxx")
file_obj.move("/new_location")
# Remove the file from one project via the raw API call.
# NOTE(review): copies cloned into other projects are expected to be
# unaffected, but if this is the only copy the removal is presumably
# permanent — confirm against the platform documentation.
dxpy.api.project_remove_objects(
    "project-xxxx",
    {"objects": ["file-xxxx"]},
)

# The same removal through an object handler. remove() acts on the project
# bound to the handler, so bind it explicitly rather than relying on the
# currently selected project.
file_obj = dxpy.DXFile("file-xxxx", project="project-xxxx")
file_obj.remove()
Archived data is moved to cheaper long-term storage:
# Request archival of the listed files within the project
archive_request = {"files": ["file-xxxx"]}
dxpy.api.project_archive("project-xxxx", archive_request)
# Request that the listed files be brought back out of archival storage
unarchive_request = {"files": ["file-xxxx"]}
dxpy.api.project_unarchive("project-xxxx", unarchive_request)
import os
# Upload every regular file found directly inside ./data
data_dir = "./data"
for filename in os.listdir(data_dir):
    filepath = os.path.join(data_dir, filename)
    if not os.path.isfile(filepath):
        continue  # skip subdirectories and other non-file entries
    dxpy.upload_local_file(
        filepath,
        project="project-xxxx",
        folder="/batch_upload",
    )
import os

# Download all files from a folder into ./downloads.
# Create the destination directory up front so the downloads do not fail
# when it is missing.
os.makedirs("./downloads", exist_ok=True)

files = dxpy.find_data_objects(
    classname="file",
    project="project-xxxx",
    folder="/results",
    describe=True,  # return names with the results: no describe() call per file
)
for file in files:
    filename = file["describe"]["name"]
    dxpy.download_dxfile(file["id"], f"./downloads/{filename}")