Hi everyone! My previous article, 《关于仿站网站的个人开发心得》, introduced the project but never actually explained how to fetch the resources. In this follow-up I'll walk through how to use the tool in practice: installation steps, what each dependency does, fixes for common problems, and a few practical tips.
🛠 I. Environment Setup: Which Libraries Do You Need?
Before using the tool, a few key Python libraries must be installed. Below is what each one does and why it's needed.
1. Core dependencies
pip install requests beautifulsoup4 cssutils ttkbootstrap
| Library | What it does | Why it's needed |
| --- | --- | --- |
| requests | Sends HTTP requests and fetches page content | Without it, pages and resources can't be downloaded |
| beautifulsoup4 | Parses HTML documents and extracts elements | Used to locate and rewrite tags such as img and link |
| cssutils | Parses CSS files and extracts url() references | Without it, background images referenced from CSS can't be downloaded |
| ttkbootstrap | Provides a modern GUI | Replaces bare tkinter with a cleaner, more professional look |

(Note: the script listed below actually pulls url() references out with a regular expression, so cssutils is optional for that particular version.)
2. Optional (packaging dependency):
pip install pyinstaller
pyinstaller packages a Python script into a standalone executable. Why you'd want it: users without a Python environment can still run the tool.
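A typical packaging command (a sketch using standard PyInstaller flags; the finished exe lands in the dist/ folder):

pyinstaller --onefile --windowed app_desktop.py

--onefile bundles everything into a single executable, and --windowed suppresses the console window, which is what you want for a GUI app.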
💡 Tip: why these libraries?
- requests is more concise than urllib and supports session keep-alive (see the short sketch below)
- beautifulsoup4 is far more reliable than regular expressions for parsing HTML
- cssutils is one of the few libraries that parses CSS syntax correctly
- ttkbootstrap fixes tkinter's dated look and ships with multiple themes
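A minimal sketch of the first two libraries working together (example.com is just a placeholder target):

import requests
from bs4 import BeautifulSoup

# Fetch a page and parse it
resp = requests.get("https://example.com", timeout=10)
soup = BeautifulSoup(resp.text, "html.parser")
print(soup.title.string if soup.title else "no <title>")
print(len(soup.find_all("img")), "img tags found")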
🚀 II. Installation and Usage (Step-by-Step)
Method 1: Run from source (for users who want to customize)
# app_desktop.py - Web page mirroring tool
import os
import uuid
import re
import requests
from urllib.parse import urljoin, urlparse, unquote
from bs4 import BeautifulSoup
from tkinter import *
import ttkbootstrap as ttk
from ttkbootstrap.constants import *
from ttkbootstrap.scrolled import ScrolledText
import threading
import webbrowser
# ========== Configuration ==========
OUTPUT_DIR = 'output'
os.makedirs(OUTPUT_DIR, exist_ok=True)
# ========== Utility functions ==========
def sanitize_filename(url):
    """Convert a URL into a safe filename"""
    # Strip characters that are unsafe in filenames
filename = re.sub(r'[^\w\s-]', '', unquote(url)).strip().replace(' ', '_')
return f"{filename[:50]}_{uuid.uuid4().hex[:8]}.html"
# ========== Resource download (with a size limit) ==========
def download_resource(session, url, base_url, save_dir):
try:
abs_url = urljoin(base_url, url)
parsed = urlparse(abs_url)
if not parsed.path or parsed.path == '/':
return None
        # Decode the URL (handles %20 and similar escapes)
filename = os.path.basename(unquote(parsed.path))
if not filename or '.' not in filename:
try:
head = session.head(abs_url, timeout=5)
content_type = head.headers.get('content-type', '').lower()
if 'jpeg' in content_type or 'jpg' in content_type:
ext = '.jpg'
elif 'png' in content_type:
ext = '.png'
elif 'gif' in content_type:
ext = '.gif'
elif 'svg' in content_type:
ext = '.svg'
else:
ext = '.bin'
filename = f"res_{hash(abs_url) % 10000}{ext}"
except:
filename = f"res_{hash(abs_url) % 10000}.bin"
filename = re.sub(r'[^\w\.\-]', '_', filename)
filepath = os.path.join(save_dir, filename)
if os.path.exists(filepath):
            return filename  # Already on disk, skip re-downloading
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
response = session.get(abs_url, headers=headers, timeout=10, stream=True)
response.raise_for_status()
        # Check the declared size (skip anything over 10 MB)
total_size = int(response.headers.get('content-length', 0))
if total_size > 10 * 1024 * 1024: # 10MB
return None
downloaded_size = 0
with open(filepath, 'wb') as f:
for chunk in response.iter_content(1024):
if chunk:
f.write(chunk)
downloaded_size += len(chunk)
                    if downloaded_size > 10 * 1024 * 1024:  # Re-check while streaming
os.remove(filepath)
return None
return filename
    except Exception:
        return None  # Any failure simply means this resource is skipped
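# Illustrative call (hypothetical values) - returns the saved filename, or None on failure:
#   session = requests.Session()
#   name = download_resource(session, "/img/logo.png", "https://example.com/", "output/assets")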
# ========== Deep clone (recursion-ready) ==========
def clone_page_with_deep_assets(url, save_path, max_depth=1, current_depth=0, log_callback=None, downloaded_urls=None):
    if log_callback is None:
        log_callback = print  # Fall back to stdout when no GUI logger is supplied
    if downloaded_urls is None:
        downloaded_urls = set()
    if current_depth > max_depth:
        return True
    session = requests.Session()
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    try:
        log_callback(f"{' ' * current_depth}🔍 Fetching: {url}")
response = session.get(url, headers=headers, timeout=10)
response.raise_for_status()
response.encoding = response.apparent_encoding
except Exception as e:
log_callback(f"❌ 请求失败: {url}, 错误: {e}")
return False
soup = BeautifulSoup(response.text, 'html.parser')
assets_dir = os.path.join(os.path.dirname(save_path), "assets")
os.makedirs(assets_dir, exist_ok=True)
    # Download images
    for img in soup.find_all("img", src=True):
        src = img['src']
        if src.lower().startswith('data:'):
            continue  # Skip inline base64 data URIs
abs_url = urljoin(url, src)
if abs_url in downloaded_urls:
continue
downloaded_urls.add(abs_url)
local_file = download_resource(session, src, url, assets_dir)
if local_file:
img['src'] = f"assets/{local_file}"
log_callback(f"{' ' * current_depth}🖼️ 图片: {src} → {local_file}")
    # Handle CSS files and their background images
for link in soup.find_all("link", rel="stylesheet", href=True):
href = link['href']
abs_url = urljoin(url, href)
if abs_url in downloaded_urls:
continue
downloaded_urls.add(abs_url)
local_file = download_resource(session, href, url, assets_dir)
if local_file:
link['href'] = f"assets/{local_file}"
log_callback(f"{' ' * current_depth}🎨 样式: {href} → {local_file}")
# 解析 CSS 中的 url(...)
css_path = os.path.join(assets_dir, local_file)
try:
with open(css_path, 'r', encoding='utf-8', errors='ignore') as f:
text = f.read()
urls = re.findall(r'url\([\'"]?(.*?)[\'"]?\)', text)
                for bg_url in urls:
                    full_bg = urljoin(abs_url, bg_url.strip())
                    if full_bg in downloaded_urls:
                        continue
                    downloaded_urls.add(full_bg)  # Record it so repeated references are skipped
                    bg_file = download_resource(session, full_bg, url, assets_dir)
if bg_file:
log_callback(f"{' ' * (current_depth+1)}🖼️ CSS 图: {bg_url} → {bg_file}")
            except Exception:
                pass  # A malformed stylesheet should not abort the whole clone
    # Handle JS
for script in soup.find_all("script", src=True):
src = script['src']
abs_url = urljoin(url, src)
if abs_url in downloaded_urls:
continue
downloaded_urls.add(abs_url)
local_file = download_resource(session, src, url, assets_dir)
if local_file:
script['src'] = f"assets/{local_file}"
            script['defer'] = True  # Defer execution until the document has parsed
            log_callback(f"{' ' * current_depth}📜 Script: {src} → {local_file}")
    # Remove known external links (Google, analytics, CDNs, etc.)
for tag in soup.find_all(['link', 'script']):
src_or_href = tag.get('src') or tag.get('href')
if src_or_href:
low = src_or_href.lower()
if any(x in low for x in ['google', 'gstatic', 'analytics', 'facebook', 'jquery', 'bootstrap', 'cdn.']):
log_callback(f"🗑️ 移除外链: {src_or_href}")
tag.decompose()
    # Save the HTML
try:
with open(save_path, 'w', encoding='utf-8') as f:
            f.write(str(soup))  # str() instead of prettify() for speed
        log_callback(f"✅ Saved: {os.path.basename(save_path)}")
return True
except Exception as e:
log_callback(f"❌ 保存失败: {e}")
return False
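# Headless usage sketch (an assumption - the GUI below normally drives this):
#   clone_page_with_deep_assets("https://example.com",
#                               os.path.join(OUTPUT_DIR, "example.html"),
#                               log_callback=print)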
# ========== GUI main program (polished) ==========
class WebMirrorApp:
def __init__(self, root):
self.root = root
self.root.title("🚀 自研快速仿站神器 - 输入网址一键生成HTML")
self.root.geometry("800x600")
self.root.resizable(True, True)
        if os.path.exists('favicon.ico'):
            self.root.iconbitmap('favicon.ico')
        # Title area
title_frame = ttk.Frame(root)
title_frame.pack(pady=15, fill=X, padx=20)
ttk.Label(
title_frame,
text="🌐 自研快速仿站神器",
font=("微软雅黑", 16, "bold"),
bootstyle="primary"
).pack()
ttk.Label(
title_frame,
text="输入网址,一键克隆网页(含图片/CSS/JS),生成可离线浏览的HTML文件",
font=("微软雅黑", 9),
foreground="#666"
).pack()
        # Input area
input_frame = ttk.Frame(root)
input_frame.pack(pady=10, fill=X, padx=20)
        self.url_entry = ttk.Entry(input_frame, font=("Microsoft YaHei", 11), width=50)
self.url_entry.pack(side=LEFT, padx=5, expand=True, fill=X)
self.url_entry.insert(0, "https://example.com")
self.start_btn = ttk.Button(
input_frame,
text="⚡ 一键仿站",
command=self.start_crawl,
bootstyle="success-outline",
width=12
)
self.start_btn.pack(side=LEFT, padx=5)
        # Log area
        log_frame = ttk.Labelframe(root, text="Run Log", padding=10)
log_frame.pack(fill=BOTH, expand=True, padx=20, pady=10)
self.log_text = ScrolledText(log_frame, font=("Consolas", 9), height=20)
self.log_text.text.config(bg="#f8f9fa", fg="#333")
self.log_text.pack(fill=BOTH, expand=True)
        # Bottom buttons
btn_frame = ttk.Frame(root)
btn_frame.pack(fill=X, padx=20, pady=10)
ttk.Button(btn_frame, text="📁 打开输出目录", command=self.open_output, bootstyle="info-link").pack(side=RIGHT, padx=5)
ttk.Button(btn_frame, text="📘 使用说明", command=self.show_help, bootstyle="light-link").pack(side=RIGHT, padx=5)
def log(self, msg):
self.log_text.text.insert(END, "> " + msg + "\n")
self.log_text.text.see(END)
self.root.update_idletasks()
def start_crawl(self):
self.log("🔘 按钮已点击")
url = self.url_entry.get().strip()
if not url:
self.log("❌ 请输入有效网址")
return
if not url.startswith(('http://', 'https://')):
url = 'https://' + url
filename = sanitize_filename(url)
filepath = os.path.join(OUTPUT_DIR, filename)
        self.start_btn.config(state=DISABLED, text="⏳ Working...")
self.log(f"🔧 目标: {url}")
self.log(f"🔧 保存: {filepath}")
def task():
try:
self.log("🔧 正在调用克隆函数...")
success = clone_page_with_deep_assets(
url=url,
save_path=filepath,
max_depth=1,
log_callback=self.log
)
if success:
self.log("🎉 仿站完成!")
else:
self.log("💔 仿站失败")
except Exception as e:
self.log(f"💥 程序崩溃: {e}")
finally:
                self.start_btn.config(state=NORMAL, text="⚡ Clone Site")
thread = threading.Thread(target=task, daemon=True)
thread.start()
def open_output(self):
if os.path.exists(OUTPUT_DIR):
            os.startfile(OUTPUT_DIR)  # Windows-only; on macOS/Linux use subprocess with "open"/"xdg-open"
def show_help(self):
help_text = """
【使用说明】
1. 输入目标网址(支持 http/https)
2. 点击「一键仿站」
3. 等待日志显示完成
4. 文件保存在「output」文件夹
5. 双击 HTML 文件可离线查看
【注意事项】
- 部分网站有反爬机制,可能失败
- 图片/CSS/JS 会自动下载并替换路径
- 生成的网页可完全离线使用
- 建议用于学习或备份,遵守网站协议
"""
win = Toplevel(self.root)
win.title("📘 使用说明")
win.geometry("550x450")
win.transient(self.root)
win.grab_set()
        text = Text(win, wrap=WORD, padx=15, pady=15, font=("Microsoft YaHei", 9))
text.insert(END, help_text)
text.config(state=DISABLED)
text.pack(fill=BOTH, expand=True)
ttk.Button(win, text="关闭", command=win.destroy, bootstyle="primary").pack(pady=10)
# ========== Entry point ==========
if __name__ == '__main__':
    app = ttk.Style(theme='cosmo')  # Other theme options: flatly, darkly, lux, morph
root = app.master
WebMirrorApp(root)
root.mainloop()
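With the dependencies installed, save the script as app_desktop.py and start it with:

python app_desktop.py

The window opens, you paste a URL, click "⚡ Clone Site", and the finished page lands in the output folder.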
If you found this project helpful, a tip to support continued updates is always welcome!
Message me a screenshot of your tip for further technical exchange and a usage video,
as well as the prebuilt exe file!