CSV Validation Automation
CSV File Validation Automation for Data Engineers
Stop Writing Custom Validation Scripts
# 47 lines of custom validation code for each vendor
import pandas as pd
import re
from datetime import datetime
def validate_vendor_a_csv(file_path):
    """Load a vendor A CSV and validate its structure and contents.

    All problems found are collected and reported together so the caller
    sees every issue in a single pass rather than one at a time.

    Args:
        file_path: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        The parsed ``DataFrame`` when every check passes.

    Raises:
        ValueError: If any check fails; the message lists each problem on
            its own line.
    """
    df = pd.read_csv(file_path)
    errors = []

    # Check required columns exist
    required_cols = ['customer_id', 'amount', 'date']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        errors.append(f"Missing columns: {missing_cols}")

    # Each per-column check is skipped when its column is absent, so a
    # missing column is reported as such instead of crashing with KeyError.

    # Validate customer_id format
    if 'customer_id' in df.columns:
        # Cast to str first: an all-numeric column is parsed as int/float and
        # has no .str accessor; such values cannot match the pattern anyway.
        id_ok = df['customer_id'].astype(str).str.match(
            r'^[A-Z]{2}\d{6}$', na=False
        )
        invalid_ids = df[~id_ok]
        if not invalid_ids.empty:
            errors.append(f"Invalid customer IDs on rows: {invalid_ids.index.tolist()}")

    # Validate amounts are positive numbers
    if 'amount' in df.columns:
        # Coerce so non-numeric text becomes NaN; "not > 0" then flags
        # negatives, zero, blanks and unparseable values alike.
        amounts = pd.to_numeric(df['amount'], errors='coerce')
        invalid_amounts = df[~(amounts > 0)]
        if not invalid_amounts.empty:
            errors.append(f"Invalid amounts on rows: {invalid_amounts.index.tolist()}")

    # Validate date format
    if 'date' in df.columns:
        try:
            pd.to_datetime(df['date'])
        except (ValueError, TypeError):
            errors.append("Invalid date format")

    # ... 30+ more lines for edge cases, encoding, duplicates, etc.

    if errors:
        raise ValueError("\n".join(errors))
    return df

# Stop Manually Handling These Common Vendor CSV Problems
Every Week You Delay Costs More Manual Hours
Already Trusted by Data Teams
Ready to Automate Your CSV Chaos?
Last updated