import os, re, json, time, random, hashlib
from urllib.parse import urlparse
import requests
from tqdm import tqdm

CDX_API="https://web.archive.org/cdx/search/cdx"
WB="https://web.archive.org/web"
DOMAIN="bilibili.163.com"
CUTOFF_FROM="19960101"
CUTOFF_TO="20211130"  # for a strict "before November" cutoff, change this to 20211031
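# Note: the CDX API accepts 1-14 digit timestamps for from/to; an 8-digit value
# like the above acts as a day-granularity bound on capture timestamps.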

OUTDIR=f"{DOMAIN}-wayback-{CUTOFF_TO}"
MANIFEST=os.path.join(OUTDIR,"_manifest.jsonl")
FAILED=os.path.join(OUTDIR,"_failed.jsonl")

SLEEP_BASE=1.2
SLEEP_JITTER=0.6
MAX_RETRIES=4

HEADERS={
  "User-Agent":"Archive-Reconstruction/1.0 (non-commercial; cultural preservation)",
  "Accept":"*/*",
}

os.makedirs(OUTDIR, exist_ok=True)

def qhash(s): return hashlib.sha1(s.encode()).hexdigest()[:12]  # short, stable fingerprint for query strings

def safe_relpath(u):
  p=urlparse(u)
  path=p.path or "/"
  if path.endswith("/"): path+="index.html"
  path=re.sub(r'[:*?"<>|]',"_",path)      # strip characters that are invalid in Windows paths
  path=path.replace("..","__")            # keep ".." segments from escaping OUTDIR
  if p.query:
    b,e=os.path.splitext(path)
    path=f"{b}__q_{qhash(p.query)}{e}"    # distinct query strings map to distinct files
  return path.lstrip("/")
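# Illustration of the mapping above (hash shortened for readability):
#   http://bilibili.163.com/             -> index.html
#   http://bilibili.163.com/special/x/   -> special/x/index.html
#   http://bilibili.163.com/list?page=2  -> list__q_<12-hex sha1 of "page=2">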

def wb_url(ts,u,mode): return f"{WB}/{ts}{mode}/{u}"
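# Wayback URL modifiers: "id_" requests the capture's original bytes, without the
# archive banner or rewritten links; "im_" is the raw/image variant, used by
# download_one as a fallback when the identity form does not return a 200.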

def write_jsonl(p,o):
  with open(p,"a",encoding="utf-8") as f: f.write(json.dumps(o,ensure_ascii=False)+"\n")

def polite_sleep(): time.sleep(SLEEP_BASE+random.random()*SLEEP_JITTER)
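# With SLEEP_BASE=1.2 and up to 0.6s of jitter, each record waits 1.2-1.8s
# (~1.5s on average), i.e. roughly 2,400 records per hour before download time.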

def cdx_num_pages(ps):
  r=requests.get(CDX_API,params={
    "url":f"{DOMAIN}/*","from":CUTOFF_FROM,"to":CUTOFF_TO,
    "showNumPages":"true","pageSize":str(ps)
  },headers=HEADERS,timeout=60)
  r.raise_for_status(); return int(r.text.strip())
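# Pagination note: showNumPages=true returns only the page count for this query,
# so the same url/from/to/pageSize must be reused when fetching each page.
# pageSize appears to count CDX index blocks rather than rows, so the number of
# records per page can vary.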

def cdx_fetch_page(p,ps):
  r=requests.get(CDX_API,params={
    "url":f"{DOMAIN}/*","from":CUTOFF_FROM,"to":CUTOFF_TO,
    "output":"json","fl":"timestamp,original,mimetype,statuscode,digest,length",
    "page":str(p),"pageSize":str(ps),
    "collapse":"urlkey","sort":"reverse","filter":"statuscode:200"
  },headers=HEADERS,timeout=120)
  r.raise_for_status()
  d=r.json() if r.text.strip() else []   # guard against a page that comes back with an empty body
  return d[1:] if d else []              # drop the field-name header row
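# Shape of the output=json payload (values here are illustrative, not real data):
#   [["timestamp","original","mimetype","statuscode","digest","length"],
#    ["20210103121500","http://bilibili.163.com/...","text/html","200","<digest>","12345"],
#    ...]
# The leading row is the field-name header, hence d[1:].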

def fetch_stream(u): return requests.get(u,headers=HEADERS,timeout=120,stream=True)

def download_one(rec):
  ts,u,mt,sc,dg,ln=rec
  rel=safe_relpath(u)
  out=os.path.join(OUTDIR,rel)
  os.makedirs(os.path.dirname(out),exist_ok=True)
  # resume support: anything already on disk with a non-empty payload is skipped
  if os.path.exists(out) and os.path.getsize(out)>0: return True,"skip",None,rel
  last_err=None
  for mode in ("id_","im_"):
    url=wb_url(ts,u,mode)
    for i in range(1,MAX_RETRIES+1):
      try:
        with fetch_stream(url) as r:
          if r.status_code in (429,502,503,504):
            # transient or rate-limit response: exponential backoff, then retry
            time.sleep(min(60,(2**(i-1))*2)+random.random()); continue
          if r.status_code!=200: break
          tmp=out+".part"
          with open(tmp,"wb") as f:
            for c in r.iter_content(131072):
              if c: f.write(c)
        if os.path.getsize(tmp)==0: os.remove(tmp); break
        os.replace(tmp,out)  # atomic rename: an interrupted run never leaves a truncated file at `out`
        with open(out+".meta.json","w",encoding="utf-8") as mf:
          json.dump({"timestamp":ts,"original":u,"wayback_url":url,"mimetype":mt,
                     "statuscode":sc,"digest":dg,"length":ln,"mode":mode,"saved_path":rel},
                    mf,ensure_ascii=False,indent=2)
        return True,mode,None,rel
      except Exception as e:
        last_err=repr(e)
        time.sleep(1+random.random())
  return False,"failed",last_err or "error",rel
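
# Optional sketch, not wired into main(): replay the records from _failed.jsonl
# after a run, reusing download_one. The CDX-only fields (mimetype, statuscode,
# digest, length) are not kept in the failure log, so they are passed as None and
# will show up as null in any regenerated .meta.json files.
def retry_failed(path=FAILED):
  if not os.path.exists(path): return
  with open(path,encoding="utf-8") as f:
    for line in f:
      rec=json.loads(line)
      download_one((rec["timestamp"],rec["original"],None,None,None,None))
      polite_sleep()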

def main():
  ps=5; pages=cdx_num_pages(ps)
  print(f"[OUT] {os.path.abspath(OUTDIR)}")
  tot=ok=0
  for p in range(pages):
    for rec in tqdm(cdx_fetch_page(p,ps),leave=False):
      tot+=1
      ok1,why,err,rel=download_one(rec)
      ts,u,mt,sc,dg,ln=rec
      write_jsonl(MANIFEST,{"ok":ok1,"why":why,"err":err,"timestamp":ts,"original":u,
                            "mimetype":mt,"statuscode":sc,"digest":dg,"length":ln,
                            "saved_path":rel})
      if not ok1: write_jsonl(FAILED,{"original":u,"timestamp":ts,"err":err})
      else: ok+=1
      polite_sleep()
    print(f"[PROGRESS] {p+1}/{pages} ok={ok} total={tot}")
  print(f"[DONE] ok={ok} total={tot}")

if __name__=="__main__": main()
