Python file seek + write outputs weird "NUL" characters in the file
I'm writing a downloader that splits the URL into parts and downloads them with threading. I don't use "join" because join means I can't stream (I cannot write to the file while a thread hasn't finished).
The problem is that f.seek plus write produces a weird file: the content of the file contains "NUL" characters (visible in Notepad++), and the text in the file is only 1/3 of the whole file.
Hey everybody, thanks for helping me. Here is version 2.0 of the code. Thanks to Padraic Cunningham for the suggestion and explanation; I fixed the code as you suggested. Please check my code; I also think I need to convert it to an http.server file-streaming method:
import os, requests
import threading
import urllib3
import urllib.request, urllib.error, urllib.parse
import time
import re

pool = urllib3.PoolManager(maxsize=10)
url = "https://raw.githubusercontent.com/langpavel/tampermonkey/master/src/emulation.js"
filename = "1.js"
countsize = 0
#if os.path.exists(filename):
#    os.remove(filename)

def defwrite(filename, data, offset):
    f = open(filename, 'wb')
    f.seek(offset)
    f.write(data)
    f.close()

def buildrange(url, numsplits):
    global pool
    value = int(requests.head(url, headers={'accept-encoding': 'identity'}).headers.get('content-length', None))
    print("fullsize: ", value)
    print("try divide by 3 :", value / 3)
    lst = []
    for i in range(numsplits):
        if i == range(numsplits):
            lst.append('%s-%s' % (i * value // numsplits + 1, i * value // numsplits + 1 + (value - (i * value // numsplits + 1))))
        if i == 0:
            lst.append('%s-%s' % (0, value // numsplits))
        else:
            lst.append('%s-%s' % (i * value // numsplits + 1, (i + 1) * value // numsplits))
    return lst

def main(url=None, splitby=3):
    global filename, pool, countsize
    start_time = time.time()
    if not url:
        print("please enter a url to begin download.")
        return
    #filename = "1.jpg"
    #print("%s bytes to download." % sizeinbytes)
    #if not sizeinbytes:
    #    print("size cannot be determined.")
    #    return
    #sizeinbytes = buildrange(url,
    datadict = {}
    f = open(filename, 'wb')
    # split total num bytes into ranges
    #ranges = buildrange(url, int(sizeinbytes), splitby)
    ranges = buildrange(url, splitby)
    print(ranges)

    def downloadchunk(idx, irange):
        print(idx)
        #time.sleep(1 * idx)
        #req = urllib.request.Request(url)
        #req.headers['Range'] = 'bytes={}'.format(irange)
        headers = urllib3._collections.HTTPHeaderDict()
        headers.add('Range', 'bytes=' + str(irange))
        data = pool.urlopen('GET', url, headers=headers).data
        #print(data)
        #print("finish: " + str(irange))
        offset = int(re.sub("(^.*?)-(.*?)$", "\\1", irange))
        print(offset)
        #print(irange)
        f.seek(offset, 0)
        #f.truncate(0)
        #print(f.tell())
        f.write(data)
        #f.read()
        #f.close()
        countsize = countsize + offset
        #defwrite("1.txt", req, re.sub("(^.*?)-", "\\1", str(irange)))

    # create 1 downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadchunk,
            args=(idx, irange),
        )
        for idx, irange in enumerate(ranges)
    ]

    # start threads, let them run in parallel, wait for them to finish
    for th in downloaders:
        th.start()
        #th.isAlive()
    #for th in downloaders:
    #    th.join()
    #    print(th.join)
    print(countsize)

    #print('done: got {} chunks, total {} bytes'.format(
    #    len(datadict), sum(len(chunk) for chunk in list(datadict.values()))))
    #print("--- %s seconds ---" % str(time.time() - start_time))
    #if os.path.exists(filename):
    #    os.remove(filename)
    # reassemble file in correct order
    #with open(filename, 'wb') as fh:
    #    for _idx, chunk in sorted(datadict.items()):
    #        fh.write(chunk)
    #stream_chunk = 16 * 1024
    #with open(filename, 'wb') as fp:
    #    while True:
    #        for _idx, chunk in sorted(datadict.items()):
    #            chunking = chunk.read(stream_chunk)
    #            if not chunk:
    #                break
    #            fp.write(chunking)
    #    print("finished writing file %s" % filename)
    #print('file size {} bytes'.format(os.path.getsize(filename)))

if __name__ == '__main__':
    if os.path.exists(filename):
        os.remove(filename)
    main(url, splitby=16)
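For reference, here is a small standalone sketch of what the two live branches of buildrange are intended to produce, using an assumed content length of 300 bytes in place of the HEAD request (the numbers here are only for illustration):

# Standalone sketch of buildrange's splitting arithmetic (assumed numbers, no network).
value = 300        # assumed content length in bytes
numsplits = 3
ranges = []
for i in range(numsplits):
    if i == 0:
        ranges.append('%s-%s' % (0, value // numsplits))
    else:
        ranges.append('%s-%s' % (i * value // numsplits + 1, (i + 1) * value // numsplits))
print(ranges)      # ['0-100', '101-200', '201-300']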
Here is the old code, please help me fix it (this is version 1.0, you can ignore it; version 2.0 is above):
import os, requests
import threading
import urllib3
import urllib.request, urllib.error, urllib.parse
import time
import re

pool = urllib3.PoolManager(maxsize=10)
url = "https://raw.githubusercontent.com/langpavel/tampermonkey/master/src/emulation.js"
filename = "1.js"
#if os.path.exists(filename):
#    os.remove(filename)

def defwrite(filename, data, offset):
    f = open(filename, 'wb')
    f.seek(offset)
    f.write(data)
    f.close()

def buildrange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == range(numsplits):
            lst.append('%s-%s' % (int(round(1 + i * value / (numsplits * 1.0), 0)), int(value - round(1 + i * value / (numsplits * 1.0) + value / (numsplits * 1.0) - 1, 0))))
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value / (numsplits * 1.0) + value / (numsplits * 1.0) - 1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value / (numsplits * 1.0), 0)), int(round(1 + i * value / (numsplits * 1.0) + value / (numsplits * 1.0) - 1, 0))))
    return lst

def main(url=None, splitby=3):
    global filename, pool
    start_time = time.time()
    if not url:
        print("please enter a url to begin download.")
        return
    #filename = "1.jpg"
    sizeinbytes = requests.head(url, headers={'accept-encoding': 'identity'}).headers.get('content-length', None)
    print("%s bytes to download." % sizeinbytes)
    if not sizeinbytes:
        print("size cannot be determined.")
        return
    datadict = {}
    # split total num bytes into ranges
    ranges = buildrange(int(sizeinbytes), splitby)

    def downloadchunk(idx, irange):
        print(idx)
        #req = urllib.request.Request(url)
        #req.headers['Range'] = 'bytes={}'.format(irange)
        headers = urllib3._collections.HTTPHeaderDict()
        headers.add('Range', 'bytes=' + str(irange))
        data = pool.urlopen('GET', url, headers=headers).data
        print(data)
        print("finish: " + str(irange))
        offset = int(re.sub("(^.*?)-(.*?)$", "\\1", irange))
        #print(offset)
        #print(irange)
        f = open(filename, 'wb')
        f.seek(offset)
        #f.truncate(0)
        #print(f.tell())
        f.write(data)
        #f.read()
        #f.close()
        #defwrite("1.txt", req, re.sub("(^.*?)-", "\\1", str(irange)))

    # create 1 downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadchunk,
            args=(idx, irange),
        )
        for idx, irange in enumerate(ranges)
    ]

    # start threads, let them run in parallel, wait for them to finish
    for th in downloaders:
        th.start()
        #th.isAlive()
    #for th in downloaders:
    #    th.join()
    #    print(th.join)

    #print('done: got {} chunks, total {} bytes'.format(
    #    len(datadict), sum(len(chunk) for chunk in list(datadict.values()))))
    #print("--- %s seconds ---" % str(time.time() - start_time))
    #if os.path.exists(filename):
    #    os.remove(filename)
    # reassemble file in correct order
    #with open(filename, 'wb') as fh:
    #    for _idx, chunk in sorted(datadict.items()):
    #        fh.write(chunk)
    #stream_chunk = 16 * 1024
    #with open(filename, 'wb') as fp:
    #    while True:
    #        for _idx, chunk in sorted(datadict.items()):
    #            chunking = chunk.read(stream_chunk)
    #            if not chunk:
    #                break
    #            fp.write(chunking)
    #    print("finished writing file %s" % filename)
    #print('file size {} bytes'.format(os.path.getsize(filename)))

if __name__ == '__main__':
    main(url, splitby=3)
You use three threads with the target function downloadchunk, and each one opens the file using wb, which truncates it, so you end up with only 1/3 of the content. You also call seek for no apparent reason. If you wanted to append to the file you would open it with a each time, or better, open the file once outside the functions. You are trying to seek in an empty file and then write; that is where the null bytes come from.
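A minimal sketch of that behaviour (with a throwaway file name) shows where the NUL bytes come from:

# Opening with "wb" truncates the file to 0 bytes, so seeking past the
# new end and writing fills the gap with NUL (\x00) bytes.
with open("demo.bin", "wb") as f:
    f.write(b"this content is lost on the next 'wb' open")

with open("demo.bin", "wb") as f:   # truncates again: file is empty now
    f.seek(100)                     # seek past the end of the empty file
    f.write(b"chunk")               # bytes 0..99 get filled with NUL

with open("demo.bin", "rb") as f:
    data = f.read()
print(len(data))    # 105
print(data[:10])    # b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'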
If you want to open the file for reading and writing so you can seek, use line buffering:

with open("whatever.file", "r+b", buffering=1) as f:

Then use f, the file handle, to write to; don't keep opening it inside the function and overwriting it, and note the file must already exist.
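As a rough sketch of that suggestion (with hypothetical names and data standing in for the downloader's ranges, and a lock around seek+write added here instead of relying on buffering): create the file once at its final size, open it a single time, and let each thread seek to its offset and write.

import threading

filename = "out.bin"
total_size = 300                    # in the real code this would come from Content-Length
# hypothetical (offset, data) pairs standing in for the downloaded chunks
chunks = [(0, b"a" * 100), (100, b"b" * 100), (200, b"c" * 100)]

# create the file once at its final size so every offset already exists
with open(filename, "wb") as f:
    f.truncate(total_size)

f = open(filename, "r+b")           # open once, outside the worker function
lock = threading.Lock()

def write_chunk(offset, data):
    # guard seek+write so two threads cannot interleave on the shared handle
    with lock:
        f.seek(offset)
        f.write(data)

threads = [threading.Thread(target=write_chunk, args=c) for c in chunks]
for t in threads:
    t.start()
for t in threads:
    t.join()                        # only waiting here so the file can be closed safely
f.close()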