Complete Guide: Creating Production-Grade Genomics Datasets for AI/ML
Creating genomics datasets for machine learning is a complex, multi-step process that requires understanding different data types, preprocessing techniques, and quality control measures. Let me guide you from basics to production-grade implementation.
1. Understanding Genomics Data Types
A. Sequence Data
```
# Raw DNA sequences (FASTA format)
>chr1:1000-2000
ATGCGATCGATCGATCGATCGATCGATCGATC...

# Quality scores (FASTQ format)
@read1
ATGCGATCGATCGATCGATC
+
!"#$%&'()*+,-./01234
```
B. Reference Genome Data
- FASTA files: Raw genomic sequences
- Annotations (GTF/GFF): Gene locations, exons, introns
- Variants (VCF): Single nucleotide polymorphisms (SNPs), insertions/deletions
C. Functional Genomics Data
- ChIP-seq: Protein-DNA binding sites
- ATAC-seq: Chromatin accessibility
- RNA-seq: Gene expression levels
- Hi-C: 3D chromosome structure
D. Clinical/Phenotype Data
- Disease status: Case/control labels
- Drug responses: Pharmacogenomics
- Quantitative traits: Height, weight, biomarkers
2. Basic Data Processing Pipeline
Step 1: Data Acquisition
```python
import pandas as pd
import numpy as np
from Bio import SeqIO
from pysam import VariantFile
import h5py

class GenomicsDataCollector:
    def __init__(self, data_directory):
        self.data_dir = data_directory
        self.sequences = {}
        self.annotations = {}
        self.variants = {}

    def load_reference_genome(self, fasta_path):
        """Load reference genome sequences"""
        sequences = {}
        for record in SeqIO.parse(fasta_path, "fasta"):
            sequences[record.id] = str(record.seq)
        return sequences

    def load_variants(self, vcf_path):
        """Load genetic variants"""
        variants = []
        vcf_in = VariantFile(vcf_path)
        for record in vcf_in.fetch():
            variant_data = {
                'chromosome': record.chrom,
                'position': record.pos,
                'ref_allele': record.ref,
                'alt_alleles': [str(alt) for alt in record.alts],
                'quality': record.qual,
                'genotypes': [sample['GT'] for sample in record.samples.values()]
            }
            variants.append(variant_data)
        return pd.DataFrame(variants)

    def load_expression_data(self, counts_file):
        """Load RNA-seq expression data"""
        expression_df = pd.read_csv(counts_file, sep='\t', index_col=0)
        return expression_df
```
Step 2: Sequence Encoding
```python
class SequenceEncoder:
    """Convert DNA sequences to numerical representations"""

    def __init__(self, encoding_type='one_hot'):
        self.encoding_type = encoding_type
        self.nucleotide_map = {'A': 0, 'T': 1, 'G': 2, 'C': 3, 'N': 4}

    def one_hot_encode(self, sequence):
        """One-hot encoding: A=[1,0,0,0], T=[0,1,0,0], G=[0,0,1,0], C=[0,0,0,1]"""
        encoded = np.zeros((len(sequence), 5))  # 5th dimension for N (unknown)
        for i, nucleotide in enumerate(sequence.upper()):
            if nucleotide in self.nucleotide_map:
                encoded[i, self.nucleotide_map[nucleotide]] = 1
        return encoded[:, :4]  # Return only the A,T,G,C dimensions

    def integer_encode(self, sequence):
        """Integer encoding: A=0, T=1, G=2, C=3, anything else=4"""
        return np.array([self.nucleotide_map.get(nuc.upper(), 4) for nuc in sequence])

    def kmer_encode(self, sequence, k=3):
        """K-mer frequency encoding (returns raw counts)"""
        from collections import Counter
        kmers = [sequence[i:i+k] for i in range(len(sequence)-k+1)]
        kmer_counts = Counter(kmers)
        return kmer_counts

# Example usage
encoder = SequenceEncoder()
dna_seq = "ATGCGATCGATC"
one_hot = encoder.one_hot_encode(dna_seq)
print(f"One-hot shape: {one_hot.shape}")  # (12, 4)
```
3. Functional Genomics Data Processing
ChIP-seq Peak Data
```python
class ChIPseqProcessor:
    def __init__(self):
        self.peak_data = None

    def load_peaks(self, bed_file):
        """Load ChIP-seq peaks from a 5-column BED file"""
        peaks = pd.read_csv(bed_file, sep='\t', header=None,
                            names=['chromosome', 'start', 'end', 'name', 'score'])
        return peaks

    def create_peak_matrix(self, peaks, genome_windows, window_size=1000):
        """Create binary matrix: 1 if window overlaps a peak, 0 otherwise"""
        peak_matrix = np.zeros((len(genome_windows), 1))
        for i, window in enumerate(genome_windows):
            chrom, start, end = window
            # Half-open interval overlap test: a peak overlaps the window if it
            # starts before the window ends AND ends after the window starts
            overlapping_peaks = peaks[
                (peaks['chromosome'] == chrom) &
                (peaks['start'] < end) &
                (peaks['end'] > start)
            ]
            peak_matrix[i, 0] = 1 if len(overlapping_peaks) > 0 else 0
        return peak_matrix
```
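A quick check of the overlap logic on a toy peak and two windows (half-open window coordinates assumed):

```python
import pandas as pd

windows = [('chr1', 0, 1000), ('chr1', 500, 1500)]
peaks = pd.DataFrame({'chromosome': ['chr1'], 'start': [700], 'end': [900],
                      'name': ['peak1'], 'score': [100]})
labels = ChIPseqProcessor().create_peak_matrix(peaks, windows)
print(labels.ravel())  # [1. 1.] -- the peak overlaps both windows
```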
```python
class ATACseqProcessor:
    """Process ATAC-seq accessibility data"""

    def load_accessibility_scores(self, bigwig_file):
        """Load per-base accessibility scores from BigWig format"""
        import pyBigWig
        bw = pyBigWig.open(bigwig_file)
        accessibility_data = {}
        for chrom, length in bw.chroms().items():
            accessibility_data[chrom] = bw.values(chrom, 0, length)
        return accessibility_data
```
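Loading full per-base arrays is memory-hungry at genome scale. For window-level features, pyBigWig's stats() call summarizes a region inside the file instead. A sketch, assuming windows follow the (chrom, start, end) convention used elsewhere in this guide:

```python
import numpy as np
import pyBigWig

def mean_accessibility_per_window(bigwig_file, windows):
    """Mean ATAC-seq signal per (chrom, start, end) window."""
    bw = pyBigWig.open(bigwig_file)
    means = []
    for chrom, start, end in windows:
        value = bw.stats(chrom, start, end, type="mean")[0]  # None if no data
        means.append(value if value is not None else 0.0)
    bw.close()
    return np.array(means)
```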
RNA-seq Expression Processing
```python
class RNAseqProcessor:
    def __init__(self):
        self.expression_data = None

    def load_counts(self, counts_file):
        """Load raw counts matrix (genes x samples)"""
        counts_df = pd.read_csv(counts_file, sep='\t', index_col=0)
        return counts_df

    def normalize_counts(self, counts_df, method='tpm'):
        """Normalize expression counts"""
        if method == 'tpm':
            # Transcripts Per Million: counts per kilobase of transcript,
            # then scaled so each sample sums to one million
            gene_lengths = self.get_gene_lengths()  # Would need gene annotation
            rpk = counts_df.div(gene_lengths / 1000, axis=0)
            tpm = rpk.div(rpk.sum(axis=0), axis=1) * 1e6
            return tpm
        elif method == 'log_cpm':
            # Log Counts Per Million
            cpm = counts_df.div(counts_df.sum(axis=0), axis=1) * 1e6
            log_cpm = np.log2(cpm + 1)
            return log_cpm

    def filter_low_expression(self, counts_df, min_counts=10, min_samples=3):
        """Filter out lowly expressed genes"""
        expressed_genes = (counts_df >= min_counts).sum(axis=1) >= min_samples
        return counts_df[expressed_genes]
```
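A quick sanity check of the filter-then-normalize flow on synthetic counts (random negative-binomial data, purely illustrative):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
counts = pd.DataFrame(
    rng.negative_binomial(5, 0.3, size=(100, 6)),
    index=[f"gene_{i}" for i in range(100)],
    columns=[f"sample_{j}" for j in range(6)],
)
proc = RNAseqProcessor()
filtered = proc.filter_low_expression(counts, min_counts=10, min_samples=3)
log_cpm = proc.normalize_counts(filtered, method='log_cpm')
print(counts.shape, filtered.shape, log_cpm.shape)
```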
4. Variant Data Processing
```python
class VariantProcessor:
    def __init__(self):
        self.variants = None

    def load_vcf(self, vcf_path):
        """Load and process a VCF file into a long-format DataFrame"""
        variants = []
        vcf_in = VariantFile(vcf_path)
        for record in vcf_in.fetch():
            for sample in record.samples:
                gt = record.samples[sample]['GT']
                variant_entry = {
                    'sample_id': sample,
                    'chromosome': record.chrom,
                    'position': record.pos,
                    'ref': record.ref,
                    'alt': str(record.alts[0]) if record.alts else '.',
                    'genotype': f"{gt[0]}/{gt[1]}" if gt != (None, None) else "./.",
                    'quality': record.qual
                }
                variants.append(variant_entry)
        return pd.DataFrame(variants)

    def create_genotype_matrix(self, variants_df):
        """Create sample x variant genotype matrix"""
        # Pivot to create matrix: rows=samples, columns=variants
        genotype_matrix = variants_df.pivot_table(
            index='sample_id',
            columns=['chromosome', 'position'],
            values='genotype',
            aggfunc='first'
        )

        # Encode genotypes additively: 0/0=0, 0/1=1, 1/1=2, missing=-1
        def encode_genotype(gt):
            if pd.isna(gt) or gt == './.':
                return -1  # Missing
            elif gt == '0/0':
                return 0   # Homozygous reference
            elif gt in ['0/1', '1/0']:
                return 1   # Heterozygous
            elif gt == '1/1':
                return 2   # Homozygous alternate
            else:
                return -1  # Other/missing (multi-allelic, phased, etc.)

        encoded_matrix = genotype_matrix.applymap(encode_genotype)  # DataFrame.map() in pandas >= 2.1
        return encoded_matrix
```
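A toy example of the pivot-and-encode step; the genotype S2 never had called at position 200 comes out as -1 (missing):

```python
import pandas as pd

toy = pd.DataFrame([
    {'sample_id': 'S1', 'chromosome': 'chr1', 'position': 100, 'genotype': '0/1'},
    {'sample_id': 'S1', 'chromosome': 'chr1', 'position': 200, 'genotype': '1/1'},
    {'sample_id': 'S2', 'chromosome': 'chr1', 'position': 100, 'genotype': '0/0'},
])
matrix = VariantProcessor().create_genotype_matrix(toy)
print(matrix)  # S1 -> [1, 2]; S2 -> [0, -1] (no call at chr1:200)
```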
5. Multi-Modal Data Integration
```python
class GenomicsDataIntegrator:
    def __init__(self):
        self.integrated_data = {}

    def create_genomic_windows(self, chromosome_lengths, window_size=1000, step_size=500):
        """Create sliding windows across the genome"""
        windows = []
        for chrom, length in chromosome_lengths.items():
            for start in range(0, length - window_size + 1, step_size):
                end = start + window_size
                windows.append((chrom, start, end))
        return windows

    def integrate_features(self, windows, sequence_data, chip_data, atac_data,
                           expression_data, variant_data):
        """Integrate all data types for each genomic window.

        (extract_window_features, get_expression_in_window and
        get_variants_in_window are helpers assumed to be defined elsewhere.)
        """
        integrated_features = []
        for window in windows:
            chrom, start, end = window

            # Extract sequence features
            sequence = sequence_data[chrom][start:end]
            sequence_features = SequenceEncoder().one_hot_encode(sequence)

            # Extract ChIP-seq features
            chip_features = self.extract_window_features(chip_data, window)

            # Extract ATAC-seq features
            atac_features = self.extract_window_features(atac_data, window)

            # Extract expression features (genes overlapping the window)
            expression_features = self.get_expression_in_window(expression_data, window)

            # Extract variant features
            variant_features = self.get_variants_in_window(variant_data, window)

            # Combine all features
            combined_features = {
                'window_id': f"{chrom}:{start}-{end}",
                'sequence': sequence_features,
                'chip_seq': chip_features,
                'atac_seq': atac_features,
                'expression': expression_features,
                'variants': variant_features
            }
            integrated_features.append(combined_features)
        return integrated_features
```
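A quick check of the sliding-window generator on a toy 10 kb chromosome:

```python
integrator = GenomicsDataIntegrator()
windows = integrator.create_genomic_windows({'chr1': 10_000}, window_size=1000, step_size=500)
print(len(windows), windows[:2])  # 19 [('chr1', 0, 1000), ('chr1', 500, 1500)]
```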
6. Quality Control and Validation
```python
class GenomicsQualityControl:
    def __init__(self):
        self.qc_results = {}

    def sequence_quality_check(self, sequences):
        """Check per-sequence quality metrics"""
        qc_metrics = {}
        for seq_id, sequence in sequences.items():
            sequence = sequence.upper()  # soft-masked references use lowercase
            qc_metrics[seq_id] = {
                'length': len(sequence),
                'gc_content': (sequence.count('G') + sequence.count('C')) / len(sequence),
                'n_content': sequence.count('N') / len(sequence),
                'complexity': self.calculate_complexity(sequence)
            }
        return pd.DataFrame(qc_metrics).T

    def calculate_complexity(self, sequence, window_size=50):
        """Calculate sequence complexity (mean Shannon entropy per window)"""
        from collections import Counter
        import math

        complexities = []
        for i in range(0, len(sequence) - window_size + 1, window_size):
            window = sequence[i:i + window_size]
            counts = Counter(window)
            # Shannon entropy: H = -sum(p * log2(p)); max is 2 bits for 4 bases
            entropy = 0
            for count in counts.values():
                p = count / len(window)
                entropy -= p * math.log2(p)
            complexities.append(entropy)
        return np.mean(complexities)

    def expression_quality_check(self, expression_df):
        """Check RNA-seq data quality"""
        qc_results = {
            'total_genes': len(expression_df),
            'total_samples': len(expression_df.columns),
            'zero_count_fraction': (expression_df == 0).sum().sum() / expression_df.size,
            'median_expression_per_sample': expression_df.median(),
            'highly_expressed_genes': (expression_df.mean(axis=1) >
                                       expression_df.mean(axis=1).quantile(0.9)).sum()
        }
        return qc_results
```
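A smoke test of the sequence QC on a synthetic repeat; with a uniform base composition, per-window entropy sits near the 2-bit maximum for four symbols:

```python
qc = GenomicsQualityControl()
metrics = qc.sequence_quality_check({'toy_contig': 'ATGC' * 250})
print(metrics)  # length=1000, gc_content=0.5, n_content=0.0, complexity ~= 2.0
```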
7. Dataset Creation for Different ML Tasks
Classification Dataset (Disease Prediction)
```python
class ClassificationDatasetBuilder:
    def __init__(self):
        self.features = None
        self.labels = None

    def create_disease_prediction_dataset(self, genomic_features, clinical_data):
        """Create dataset for disease classification"""
        feature_matrix = []
        sample_ids = []
        for sample_id in genomic_features:
            # Flatten all genomic features for this sample
            sample_features = []
            # Add sequence-based features
            sample_features.extend(genomic_features[sample_id]['sequence_features'])
            # Add variant features
            sample_features.extend(genomic_features[sample_id]['variant_features'])
            # Add expression features
            sample_features.extend(genomic_features[sample_id]['expression_features'])
            feature_matrix.append(sample_features)
            sample_ids.append(sample_id)

        # Get labels from clinical data
        labels = [clinical_data[sid]['disease_status'] for sid in sample_ids]
        return np.array(feature_matrix), np.array(labels), sample_ids
```
Regression Dataset (Quantitative Trait Prediction)
```python
class RegressionDatasetBuilder:
    def create_trait_prediction_dataset(self, genomic_features, trait_data, trait_name):
        """Create dataset for quantitative trait prediction.

        (extract_sample_features is a helper assumed to flatten one sample's
        features, analogous to the classification builder above.)
        """
        feature_matrix = []
        trait_values = []
        sample_ids = []
        for sample_id in genomic_features:
            if sample_id in trait_data and trait_name in trait_data[sample_id]:
                sample_features = self.extract_sample_features(genomic_features[sample_id])
                trait_value = trait_data[sample_id][trait_name]
                feature_matrix.append(sample_features)
                trait_values.append(trait_value)
                sample_ids.append(sample_id)
        return np.array(feature_matrix), np.array(trait_values), sample_ids
```
Sequence-to-Sequence Dataset (Variant Effect Prediction)
```python
class SequenceToSequenceBuilder:
    def create_variant_effect_dataset(self, reference_sequences, variants,
                                      functional_scores):
        """Create dataset for predicting variant functional effects"""
        dataset = []
        for variant in variants:
            chrom = variant['chromosome']
            pos = variant['position']
            ref = variant['ref_allele']
            alt = variant['alt_allele']

            # Extract the reference sequence around the variant
            window_start = max(0, pos - 500)
            window_end = pos + 500
            ref_sequence = reference_sequences[chrom][window_start:window_end]

            # Create the alternative sequence by substituting the alt allele
            relative_pos = pos - window_start
            alt_sequence = (ref_sequence[:relative_pos] +
                            alt +
                            ref_sequence[relative_pos + len(ref):])

            # Get the functional score if available
            variant_key = f"{chrom}:{pos}:{ref}:{alt}"
            score = functional_scores.get(variant_key, None)
            if score is not None:
                dataset.append({
                    'ref_sequence': ref_sequence,
                    'alt_sequence': alt_sequence,
                    'functional_score': score,
                    'variant_info': variant
                })
        return dataset
```
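One practical wrinkle: for insertions and deletions, ref_sequence and alt_sequence end up with different lengths, which breaks fixed-size model inputs. A simple sketch of one common workaround (trim or N-pad to a target length; other schemes, such as re-centering on the variant, are equally valid):

```python
def fix_length(sequence, target_len=1000, pad_char='N'):
    """Trim or right-pad a sequence to exactly target_len characters."""
    if len(sequence) >= target_len:
        return sequence[:target_len]
    return sequence + pad_char * (target_len - len(sequence))

assert len(fix_length('ATGC' * 10)) == 1000
```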
8. Production-Grade Data Pipeline
```python
class ProductionGenomicsDataPipeline:
    """End-to-end pipeline. (The load/QC/feature-engineering helpers called
    below are assumed to be implemented using the processors above.)"""

    def __init__(self, config):
        self.config = config
        self.data_processors = {
            'sequence': SequenceEncoder(),
            'variants': VariantProcessor(),
            'expression': RNAseqProcessor(),
            'chip_seq': ChIPseqProcessor(),
            'atac_seq': ATACseqProcessor()
        }

    def create_training_dataset(self, data_sources, task_type='classification'):
        """Complete pipeline to create an ML-ready dataset"""
        print("Step 1: Loading raw data...")
        raw_data = self.load_all_data_sources(data_sources)

        print("Step 2: Quality control...")
        qc_results = self.run_quality_control(raw_data)

        print("Step 3: Data preprocessing...")
        processed_data = self.preprocess_data(raw_data)

        print("Step 4: Feature engineering...")
        features = self.engineer_features(processed_data)

        print("Step 5: Dataset creation...")
        if task_type == 'classification':
            X, y, sample_ids = self.create_classification_dataset(features)
        elif task_type == 'regression':
            X, y, sample_ids = self.create_regression_dataset(features)

        print("Step 6: Train/validation/test split...")
        datasets = self.create_data_splits(X, y, sample_ids)

        print("Step 7: Data validation...")
        validation_results = self.validate_final_dataset(datasets)

        return datasets, validation_results

    def save_dataset(self, datasets, output_path):
        """Save the processed dataset in multiple formats"""
        # Save as HDF5 (efficient for large arrays)
        with h5py.File(f"{output_path}/genomics_dataset.h5", 'w') as f:
            for split_name, split_data in datasets.items():
                group = f.create_group(split_name)
                group.create_dataset('features', data=split_data['X'])
                group.create_dataset('labels', data=split_data['y'])
                # h5py needs fixed-width bytes (or a string dtype) for text
                group.create_dataset('sample_ids',
                                     data=np.array(split_data['sample_ids'], dtype='S'))

        # Save metadata
        import json
        metadata = {
            'dataset_info': self.config,
            'feature_names': self.get_feature_names(),
            'data_shapes': {k: v['X'].shape for k, v in datasets.items()},
            'creation_date': pd.Timestamp.now().isoformat()
        }
        with open(f"{output_path}/dataset_metadata.json", 'w') as f:
            json.dump(metadata, f, indent=2, default=str)

# Usage example
config = {
    'window_size': 1000,
    'step_size': 500,
    'min_expression_threshold': 1.0,
    'variant_quality_threshold': 30,
    'test_size': 0.2,
    'validation_size': 0.1
}
pipeline = ProductionGenomicsDataPipeline(config)
```
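create_data_splits is called above but not defined; a sketch using scikit-learn, assuming the config's test_size and validation_size are fractions of the full dataset and that classification labels should be stratified. For cohorts with related individuals, split by family or group instead so relatives never straddle train and test:

```python
import numpy as np
from sklearn.model_selection import train_test_split

def create_data_splits(X, y, sample_ids, test_size=0.2, validation_size=0.1, seed=42):
    """Stratified train/validation/test split, keeping sample IDs aligned."""
    ids = np.asarray(sample_ids)
    X_tmp, X_test, y_tmp, y_test, ids_tmp, ids_test = train_test_split(
        X, y, ids, test_size=test_size, stratify=y, random_state=seed)
    # validation_size is a fraction of the whole dataset, so rescale it here
    val_fraction = validation_size / (1 - test_size)
    X_train, X_val, y_train, y_val, ids_train, ids_val = train_test_split(
        X_tmp, y_tmp, ids_tmp, test_size=val_fraction,
        stratify=y_tmp, random_state=seed)
    return {
        'train': {'X': X_train, 'y': y_train, 'sample_ids': ids_train},
        'validation': {'X': X_val, 'y': y_val, 'sample_ids': ids_val},
        'test': {'X': X_test, 'y': y_test, 'sample_ids': ids_test},
    }
```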
9. Advanced Feature Engineering
```python
class GenomicsFeatureEngineer:
    def __init__(self):
        self.feature_transformers = {}

    def create_motif_features(self, sequences, motif_database):
        """Create transcription factor binding motif features"""
        motif_scores = []
        for sequence in sequences:
            sequence_scores = []
            for motif_name, motif_pwm in motif_database.items():
                # Calculate the best motif match score in this sequence
                best_score = self.scan_motif(sequence, motif_pwm)
                sequence_scores.append(best_score)
            motif_scores.append(sequence_scores)
        return np.array(motif_scores)

    def create_evolutionary_features(self, sequences, conservation_scores):
        """Add evolutionary conservation features"""
        conservation_features = []
        for i, sequence in enumerate(sequences):
            if i in conservation_scores:
                avg_conservation = np.mean(conservation_scores[i])
                max_conservation = np.max(conservation_scores[i])
                conservation_var = np.var(conservation_scores[i])
                conservation_features.append([
                    avg_conservation, max_conservation, conservation_var
                ])
            else:
                conservation_features.append([0, 0, 0])  # Unknown conservation
        return np.array(conservation_features)

    def create_chromatin_state_features(self, genomic_windows, chromatin_states):
        """Add chromatin state annotations"""
        state_features = []
        # Define chromatin states (e.g., from ChromHMM)
        state_names = ['active_promoter', 'weak_promoter', 'active_enhancer',
                       'weak_enhancer', 'insulator', 'transcribed', 'repressed']
        for window in genomic_windows:
            chrom, start, end = window
            window_states = np.zeros(len(state_names))
            # Check overlap with annotated chromatin states
            if chrom in chromatin_states:
                for state_idx, state_name in enumerate(state_names):
                    if self.overlaps_state(window, chromatin_states[chrom][state_name]):
                        window_states[state_idx] = 1
            state_features.append(window_states)
        return np.array(state_features)
```
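scan_motif is referenced above but not shown. A minimal sketch, assuming the PWM is a (4, motif_length) NumPy array of log-odds scores with rows ordered A/C/G/T:

```python
import numpy as np

BASE_INDEX = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

def scan_motif(sequence, pwm):
    """Best log-odds match of a (4, L) PWM over all positions in sequence."""
    motif_len = pwm.shape[1]
    best_score = float('-inf')
    for i in range(len(sequence) - motif_len + 1):
        window = sequence[i:i + motif_len].upper()
        if any(base not in BASE_INDEX for base in window):
            continue  # skip windows containing N or other ambiguity codes
        score = sum(pwm[BASE_INDEX[base], j] for j, base in enumerate(window))
        best_score = max(best_score, score)
    return best_score
```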
10. Data Validation and Quality Metrics
```python
class DatasetValidator:
    def __init__(self):
        self.validation_results = {}

    def validate_dataset_integrity(self, X, y, sample_ids):
        """Comprehensive dataset validation"""
        validation_results = {}

        # Basic shape consistency
        validation_results['shape_consistency'] = len(X) == len(y) == len(sample_ids)

        # Missing value analysis
        validation_results['missing_values'] = {
            'features_missing_pct': np.isnan(X).sum() / X.size * 100,
            'samples_with_missing': np.isnan(X).any(axis=1).sum(),
            'features_with_missing': np.isnan(X).any(axis=0).sum()
        }

        # Label distribution
        from collections import Counter
        validation_results['label_distribution'] = dict(Counter(y))

        # Feature statistics
        validation_results['feature_stats'] = {
            'mean_values': np.nanmean(X, axis=0),
            'std_values': np.nanstd(X, axis=0),
            'zero_variance_features': (np.nanvar(X, axis=0) == 0).sum()
        }

        # Data leakage checks
        validation_results['duplicate_samples'] = len(sample_ids) - len(set(sample_ids))

        return validation_results

    def create_data_quality_report(self, validation_results, output_path):
        """Generate a data quality report in Markdown"""
        report = []
        report.append("# Genomics Dataset Quality Report\n")
        report.append(f"Generated: {pd.Timestamp.now()}\n\n")

        # Dataset overview
        report.append("## Dataset Overview\n")
        report.append(f"- Total samples: {validation_results.get('total_samples', 'N/A')}\n")
        report.append(f"- Total features: {validation_results.get('total_features', 'N/A')}\n")
        report.append(f"- Missing values: {validation_results['missing_values']['features_missing_pct']:.2f}%\n\n")

        # Quality issues
        report.append("## Quality Issues\n")
        if validation_results['duplicate_samples'] > 0:
            report.append(f"⚠️ Found {validation_results['duplicate_samples']} duplicate samples\n")
        if validation_results['feature_stats']['zero_variance_features'] > 0:
            report.append(f"⚠️ Found {validation_results['feature_stats']['zero_variance_features']} zero-variance features\n")

        # Save report
        with open(f"{output_path}/quality_report.md", 'w') as f:
            f.writelines(report)
```
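A smoke test of the validator on random data (purely illustrative):

```python
import numpy as np

X = np.random.rand(100, 20)
y = np.random.randint(0, 2, size=100)
ids = [f"sample_{i}" for i in range(100)]
results = DatasetValidator().validate_dataset_integrity(X, y, ids)
print(results['shape_consistency'], results['duplicate_samples'])  # True 0
```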
This comprehensive framework covers everything from basic sequence processing to production-grade dataset creation. Key principles for genomics ML datasets:
- Clean, consistent data: Remove errors and standardize formats
- Proper quality control: Validate every step of processing
- Multi-modal integration: Combine different data types effectively
- Biological relevance: Maintain biological meaning in features
- Scalable pipelines: Handle large genomic datasets efficiently
- Reproducible workflows: Document and version everything
The framework handles the unique challenges of genomics data: massive scale, multiple data types, missing values, and the need for biological interpretation alongside machine learning performance.